%ls data
archive.zip olist_orders_dataset.csv olist_customers_dataset.csv olist_products_dataset.csv olist_geolocation_dataset.csv olist_products_dataset_po.csv olist_order_items_dataset.csv olist_sellers_dataset.csv olist_order_payments_dataset.csv product_category_name_translation.csv olist_order_reviews_dataset.csv
# Importing Libraries
# Standard libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime
from matplotlib.gridspec import GridSpec
pd.set_option('display.max_columns', 100)
import plotly.offline as py
import plotly.express as px
import plotly.graph_objs as go
import json
import requests
import folium
from folium.plugins import FastMarkerCluster, Fullscreen, MiniMap, HeatMap, HeatMapWithTime, LocateControl
import opendatasets as od
import calendar
from pandas.api.types import CategoricalDtype
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud
from collections import Counter
from PIL import Image
# # Utilities
# from viz_utils import *
# from custom_transformers import *
# from ml_utils import *
# import tensorflow_hub as hub
# from object_detection.utils import visualization_utils as viz_utils
# DataPrep
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib
# Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
geo_data = pd.read_csv('data/olist_geolocation_dataset.csv')
order_data = pd.read_csv('data/olist_orders_dataset.csv')
order_item_data = pd.read_csv('data/olist_order_items_dataset.csv')
products_data = pd.read_csv('data/olist_products_dataset.csv')
sellers_data = pd.read_csv('data/olist_sellers_dataset.csv')
payment_data = pd.read_csv('data/olist_order_payments_dataset.csv')
order_data.isnull().sum()
order_id 0 customer_id 0 order_status 0 order_purchase_timestamp 0 order_approved_at 160 order_delivered_carrier_date 1783 order_delivered_customer_date 2965 order_estimated_delivery_date 0 dtype: int64
결측값의 비율은 약 3%이므로 'order_delivered_customer_date'의 Null 값은 'order_estimated_delivery_date'로 채우기
## **결측값의 비율은 약 3%이므로 고객 날짜 기능의 Null 값은 forward-fill method로 채우기**
# order_data['order_delivered_customer_date']= order_data['order_delivered_customer_date'].fillna(method='ffill')
# order_data['order_delivered_customer_date'].isnull().sum()
order_data['order_delivered_customer_date']= order_data['order_delivered_customer_date'].fillna(order_data['order_estimated_delivery_date'])
order_data['order_delivered_customer_date'].isnull().sum()
0
배송 완료 날짜 및 예상 날짜 컬럼 생성
order_data['delivered_time']=pd.to_datetime(order_data['order_delivered_customer_date'],
format='%Y-%m-%d').dt.date
order_data['estimate_time']=pd.to_datetime(order_data['order_estimated_delivery_date'],
format='%Y-%m-%d').dt.date
주문 배송된 고객 날짜를 기준으로 Weekly 생성
order_data['weekly']=pd.to_datetime(order_data['order_delivered_customer_date'],
format='%Y-%m-%d').dt.week
주문 배송된 고객 날짜를 기반으로 Monthly 생성
order_data['monthly']=pd.to_datetime(order_data['order_delivered_customer_date'])\
.dt.to_period('M')
order_data['monthly']= order_data['monthly'].astype(str)
예상 날짜와 배송된 날짜의 차이로 diff_days 생성
order_data['diff_days']= order_data['delivered_time']-order_data['estimate_time']
order_data['diff_days']= order_data['diff_days'].dt.days
# 배송완료된 주문제품만
order_data_delivery = order_data[order_data["order_status"]=='delivered']
# 배송완료 예정 날짜보다 늦게 배송된 경우
order_data_delivery[order_data_delivery['diff_days']>0].shape
(6534, 13)
# 배송완료 예정 날짜 = 실제 배송완료 날짜
order_data_delivery[order_data_delivery['diff_days']==0].shape
(1300, 13)
# 배송완료 예정 날짜보다 빨리 배송된 경우
order_data_delivery[order_data_delivery['diff_days']<0].shape
(88644, 13)
order_data_delivery.head(2)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | delivered_time | estimate_time | weekly | monthly | diff_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | 2017-10-10 | 2017-10-18 | 41 | 2017-10 | -8 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 | 2018-08-07 | 2018-08-13 | 32 | 2018-08 | -6 |
order_data_delivery.groupby(['weekly'])["diff_days"].mean()
weekly 1 -9.179291 2 -11.274967 3 -13.562369 4 -14.163656 5 -15.182986 6 -16.203322 7 -15.799630 8 -13.617605 9 -11.401540 10 -10.357715 11 -9.141724 12 -8.606580 13 -9.058068 14 -7.699257 15 -8.748567 16 -9.904279 17 -11.989136 18 -11.790677 19 -11.844112 20 -11.816672 21 -11.213804 22 -11.815772 23 -13.277978 24 -18.280579 25 -16.657393 26 -16.180823 27 -15.812530 28 -14.131258 29 -12.202691 30 -11.657437 31 -11.331565 32 -9.960604 33 -8.816615 34 -10.163465 35 -10.815978 36 -11.667949 37 -10.772212 38 -5.782814 39 -11.297828 40 -13.302050 41 -15.554979 42 -13.951473 43 -13.352159 44 -12.252772 45 -11.619243 46 -10.698646 47 -10.564319 48 -10.555990 49 -11.439655 50 -11.378305 51 -10.894824 52 -11.250749 Name: diff_days, dtype: float64
plt.figure(figsize=(20,10))
sns.lineplot(x='weekly', y='diff_days', data=order_data_delivery, color="coral", linewidth=5,
markers=True,dashes=False, estimator='mean')
plt.xlabel("Weeks", size=14)
plt.ylabel("Difference Days", size=14)
plt.title("Average Difference Days per Week",size=15, weight='bold')
Text(0.5, 1.0, 'Average Difference Days per Week')
order_data_delivery.groupby(['monthly'])["diff_days"].mean()
monthly 2016-10 -41.434146 2016-11 -20.379310 2016-12 -7.250000 2017-01 -30.056537 2017-02 -25.159141 2017-03 -14.372796 2017-04 -14.546241 2017-05 -12.490269 2017-06 -13.530397 2017-07 -12.523300 2017-08 -13.255463 2017-09 -10.343001 2017-10 -12.648865 2017-11 -10.765310 2017-12 -11.242992 2018-01 -11.997878 2018-02 -12.132479 2018-03 -7.889215 2018-04 -8.353758 2018-05 -11.275489 2018-06 -16.790630 2018-07 -13.676762 2018-08 -8.372865 2018-09 22.553571 2018-10 83.666667 Name: diff_days, dtype: float64
order_data_delivery1 = order_data_delivery
order_data_delivery1 = order_data_delivery.sort_values('monthly')
plt.figure(figsize=(20,10))
sns.lineplot(x='monthly', y='diff_days', data=order_data_delivery1, color="coral", linewidth=5,
markers=True,dashes=False,estimator='mean')
plt.xlabel("Months", size=14)
plt.ylabel("Difference Days", size=14)
plt.title("Average Difference Days per Month",size=15, weight='bold')
Text(0.5, 1.0, 'Average Difference Days per Month')
배송예상날짜와 배송완료날짜 차이를 판매자와 카테고리별로 보기 위해
df_orders = pd.merge(order_data, order_item_data, on="order_id",how = 'right')
df_orders = pd.merge(df_orders, products_data, on="product_id",how = 'right')
df_orders = df_orders.drop_duplicates(['order_id','product_id'], keep = 'first')
df_orders[df_orders.duplicated(['order_id','product_id'])]
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | delivered_time | estimate_time | weekly | monthly | diff_days | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm |
|---|
# 배송완료된 주문 제품만
df_orders_delivery = df_orders[df_orders["order_status"]=='delivered']
seller별 배송예상날짜와 실제배송날짜 차이 평균
df_orders_delivery.groupby(['seller_id'])["diff_days"].mean().sort_values()
seller_id
933446e9a59dece7ae9175103820ca8f -66.0
0b09101900100c0e9d312861fad5a1b9 -61.0
fa5fdc4e4bb6bd1009ad0e4ac4096562 -58.0
ffff564a4f9085cd26170f4732393726 -48.0
58e4b302b54937e55a678c4d15111da4 -48.0
...
6f1a1263039c76e68f40a8e536b1da6a 31.0
8629a7efec1aab257e58cda559f03ba7 33.0
4fb41dff7c50136976d1a5cf004a42e2 33.0
8e670472e453ba34a379331513d6aab1 35.0
df683dfda87bf71ac3fc63063fba369d 167.0
Name: diff_days, Length: 2970, dtype: float64
카테고리별 배송예상날짜와 실제배송날짜 차이 평균
df_orders.groupby(['product_category_name'])["diff_days"].mean().sort_values()
product_category_name
security_and_services -17.000000
la_cuisine -16.384615
cds_dvds_musicals -15.833333
fixed_telephony -14.562212
fashion_shoes -14.521236
...
home_confort -9.679803
food -9.626932
home_comfort_2 -8.708333
furniture_mattress_and_upholstery -6.973684
arts_and_craftmanship -6.791667
Name: diff_days, Length: 71, dtype: float64
df_orders_delivery['diff_days'].sort_values()
84882 -147
103127 -140
20985 -135
87161 -124
59215 -109
...
36542 166
95255 167
110636 175
1466 181
86906 188
Name: diff_days, Length: 100196, dtype: int64
# 첫 번째 그룹바이
df_agg = df_orders_delivery.groupby(['product_category_name','seller_id']).agg({'diff_days':'mean'})
df_agg
| diff_days | ||
|---|---|---|
| product_category_name | seller_id | |
| agro_industry_and_commerce | 06579cb253ecd5a3a12a9e6eb6bf8f47 | -10.000000 |
| 0ed6ce5d87fd9c69eaacaeb778d67235 | -9.000000 | |
| 2528744c5ef5d955adc318720a94d2e7 | -9.400000 | |
| 269cff2d3c8d205c11f37a52402ea93b | -9.000000 | |
| 31ae0774c17fabd06ff707cc5bde005f | -12.857143 | |
| ... | ... | ... |
| watches_gifts | f8db351d8c4c4c22c6835c19a46f01b0 | -11.200000 |
| f9eedec3129e8cc6b6429c42d0808c5b | -9.000000 | |
| fa1c13f2614d7b5c4749cbc52fecda94 | -10.309441 | |
| fcba71d4d5db057f43249145e4eb1e27 | -7.666667 | |
| ffc470761de7d0232558ba5e786e57b7 | -13.400000 |
6144 rows × 1 columns
df_agg1 = df_agg
df_agg1 = df_agg.sort_values('diff_days')
plt.figure(figsize=(20,8))
sns.lineplot(x='product_category_name', y='diff_days', data=df_agg1, color="coral", linewidth=4,
markers=True,dashes=False,estimator='mean')
plt.xticks(rotation=90)
plt.xlabel("Category", size=14)
plt.ylabel("Difference Days", size=14)
plt.title("Average Difference Days per Category",size=15, weight='bold')
Text(0.5, 1.0, 'Average Difference Days per Category')
카테고리별 셀러의 배송예상날짜와 실제배송날짜 차이 평균 순위
# 첫 번째 그룹바이
df_agg = df_orders_delivery.groupby(['product_category_name','seller_id']).agg({'diff_days':'mean'})
df_agg
| diff_days | ||
|---|---|---|
| product_category_name | seller_id | |
| agro_industry_and_commerce | 06579cb253ecd5a3a12a9e6eb6bf8f47 | -10.000000 |
| 0ed6ce5d87fd9c69eaacaeb778d67235 | -9.000000 | |
| 2528744c5ef5d955adc318720a94d2e7 | -9.400000 | |
| 269cff2d3c8d205c11f37a52402ea93b | -9.000000 | |
| 31ae0774c17fabd06ff707cc5bde005f | -12.857143 | |
| ... | ... | ... |
| watches_gifts | f8db351d8c4c4c22c6835c19a46f01b0 | -11.200000 |
| f9eedec3129e8cc6b6429c42d0808c5b | -9.000000 | |
| fa1c13f2614d7b5c4749cbc52fecda94 | -10.309441 | |
| fcba71d4d5db057f43249145e4eb1e27 | -7.666667 | |
| ffc470761de7d0232558ba5e786e57b7 | -13.400000 |
6144 rows × 1 columns
# 인덱스의 첫 번째 수준으로 그룹화
g = df_agg['diff_days'].groupby(level=0, group_keys=False)
# 각 그룹 내에서 diff_days 열을 내림차순으로 정렬하고 상위 3 개 행만 가져오기
#delivery_ = g.apply(lambda x: x.order(ascending=False).head(3))
delivery_fast = g.apply(lambda x: x.sort_values(ascending=False).tail(3))
delivery_fast
product_category_name seller_id
agro_industry_and_commerce 6b2612338467c08c9b25f0cc55b1578d -22.000000
85cc55e048b6bcc90c75afe7f0e72f72 -23.000000
55bb6fe5bebe5012e2d7d9928e1128ff -24.000000
air_conditioning ac6417315ae5da8f9a38ee11de6f7780 -28.000000
c7dcd301ecfe5ab7f778ac172cf74be7 -29.000000
...
toys e883aa812c37a7413226856f4579fe91 -37.000000
4bde6149c15cf7e177b36fa060dd6de8 -52.000000
watches_gifts 710e3548e02bc1d2831dfc4f1b5b14d4 -29.666667
747e73c01a4593d4cf60dcf970fd2689 -30.000000
9f505651f4a6abe901a56cdc21508025 -33.333333
Name: diff_days, Length: 209, dtype: float64
delivery_slow = g.apply(lambda x: x.sort_values(ascending=False).head(3))
delivery_slow
product_category_name seller_id
agro_industry_and_commerce 3d0cd21d41671c46f82cd11176bf7277 -2.000000
397c4d0c005b6f41f90098ac724e28cb -3.000000
6bd69102ab48df500790a8cecfc285c2 -3.600000
air_conditioning 15aac934c58d886785ac1b17953ea898 4.666667
b2ba3715d723d245138f291a6fe42594 -2.000000
...
toys 4c8b8048e33af2bf94f2eb547746a916 12.000000
bdb3edbaee43a761e2d4f258dc08f348 7.000000
watches_gifts 4fb41dff7c50136976d1a5cf004a42e2 33.000000
e7df4cd29ab5abab70fb0783ddb53987 6.000000
e82de6494d91d3c4c54450f59b227a94 3.000000
Name: diff_days, Length: 209, dtype: float64
# 한번에 보기
df_orders_delivery.groupby(['product_category_name']).apply(lambda x: (x.groupby('seller_id')
.mean()
.sort_values('diff_days', ascending=False))
.tail(3))
| weekly | diff_days | order_item_id | price | freight_value | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| product_category_name | seller_id | ||||||||||||
| agro_industry_and_commerce | 6b2612338467c08c9b25f0cc55b1578d | 9.000000 | -22.000000 | 1.0 | 92.900000 | 20.970000 | 40.000000 | 628.000000 | 1.000000 | 2650.0 | 52.0 | 13.000000 | 32.0 |
| 85cc55e048b6bcc90c75afe7f0e72f72 | 31.000000 | -23.000000 | 1.0 | 33.900000 | 14.400000 | 57.000000 | 980.000000 | 4.000000 | 650.0 | 46.0 | 4.000000 | 15.0 | |
| 55bb6fe5bebe5012e2d7d9928e1128ff | 22.500000 | -24.000000 | 1.0 | 98.500000 | 47.125000 | 56.000000 | 663.000000 | 1.500000 | 6125.0 | 99.0 | 7.500000 | 65.0 | |
| air_conditioning | ac6417315ae5da8f9a38ee11de6f7780 | 19.000000 | -28.000000 | 1.0 | 110.300000 | 14.540000 | 30.000000 | 547.000000 | 1.000000 | 2000.0 | 32.0 | 51.000000 | 16.0 |
| c7dcd301ecfe5ab7f778ac172cf74be7 | 43.000000 | -29.000000 | 1.0 | 299.990000 | 19.010000 | 41.000000 | 775.000000 | 4.000000 | 8600.0 | 29.0 | 50.000000 | 25.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| toys | e883aa812c37a7413226856f4579fe91 | 34.000000 | -37.000000 | 1.0 | 59.900000 | 10.500000 | 24.000000 | 155.000000 | 1.000000 | 500.0 | 28.0 | 4.000000 | 14.0 |
| 4bde6149c15cf7e177b36fa060dd6de8 | 41.000000 | -52.000000 | 1.0 | 79.900000 | 16.330000 | 63.000000 | 683.000000 | 6.000000 | 300.0 | 17.0 | 9.000000 | 16.0 | |
| watches_gifts | 710e3548e02bc1d2831dfc4f1b5b14d4 | 34.000000 | -29.666667 | 1.0 | 149.990000 | 15.586667 | 28.000000 | 321.000000 | 1.000000 | 800.0 | 22.0 | 14.000000 | 13.0 |
| 747e73c01a4593d4cf60dcf970fd2689 | 14.333333 | -30.000000 | 1.0 | 739.330000 | 41.256667 | 46.333333 | 424.333333 | 1.333333 | 438.0 | 21.0 | 14.333333 | 19.0 | |
| 9f505651f4a6abe901a56cdc21508025 | 21.666667 | -33.333333 | 1.0 | 21.656667 | 18.206667 | 60.000000 | 515.000000 | 2.000000 | 100.0 | 16.0 | 3.000000 | 11.0 |
209 rows × 12 columns
# 카테고리 종류 보기
df_orders_delivery['product_category_name'].unique().tolist()
['perfumery', 'art', 'sports_leisure', 'baby', 'housewares', 'musical_instruments', 'cool_stuff', 'furniture_decor', 'home_appliances', 'toys', 'bed_bath_table', 'construction_tools_safety', 'computers_accessories', 'health_beauty', 'luggage_accessories', 'garden_tools', 'office_furniture', 'auto', 'electronics', 'fashion_shoes', 'telephony', 'stationery', 'fashion_bags_accessories', 'computers', 'home_construction', 'watches_gifts', 'construction_tools_construction', 'pet_shop', 'small_appliances', 'agro_industry_and_commerce', nan, 'furniture_living_room', 'signaling_and_security', 'air_conditioning', 'consoles_games', 'books_general_interest', 'costruction_tools_tools', 'fashion_underwear_beach', 'fashion_male_clothing', 'kitchen_dining_laundry_garden_furniture', 'industry_commerce_and_business', 'fixed_telephony', 'construction_tools_lights', 'books_technical', 'home_appliances_2', 'party_supplies', 'drinks', 'market_place', 'la_cuisine', 'costruction_tools_garden', 'fashio_female_clothing', 'home_confort', 'audio', 'food_drink', 'music', 'food', 'tablets_printing_image', 'small_appliances_home_oven_and_coffee', 'fashion_sport', 'christmas_supplies', 'fashion_childrens_clothes', 'dvds_blu_ray', 'arts_and_craftmanship', 'books_imported', 'furniture_bedroom', 'cine_photo', 'diapers_and_hygiene', 'flowers', 'home_comfort_2', 'security_and_services', 'furniture_mattress_and_upholstery', 'cds_dvds_musicals']
df_orders_delivery['product_category_name_port'].unique().tolist()
['perfumaria', 'artes', 'esporte_lazer', 'bebes', 'utilidades_domesticas', 'instrumentos_musicais', 'cool_stuff', 'moveis_decoracao', 'eletrodomesticos', 'brinquedos', 'cama_mesa_banho', 'construcao_ferramentas_seguranca', 'informatica_acessorios', 'beleza_saude', 'malas_acessorios', 'ferramentas_jardim', 'moveis_escritorio', 'automotivo', 'eletronicos', 'fashion_calcados', 'telefonia', 'papelaria', 'fashion_bolsas_e_acessorios', 'pcs', 'casa_construcao', 'relogios_presentes', 'construcao_ferramentas_construcao', 'pet_shop', 'eletroportateis', 'agro_industria_e_comercio', nan, 'moveis_sala', 'sinalizacao_e_seguranca', 'climatizacao', 'consoles_games', 'livros_interesse_geral', 'construcao_ferramentas_ferramentas', 'fashion_underwear_e_moda_praia', 'fashion_roupa_masculina', 'moveis_cozinha_area_de_servico_jantar_e_jardim', 'industria_comercio_e_negocios', 'telefonia_fixa', 'construcao_ferramentas_iluminacao', 'livros_tecnicos', 'eletrodomesticos_2', 'artigos_de_festas', 'bebidas', 'market_place', 'la_cuisine', 'construcao_ferramentas_jardim', 'fashion_roupa_feminina', 'casa_conforto', 'audio', 'alimentos_bebidas', 'musica', 'alimentos', 'tablets_impressao_imagem', 'portateis_casa_forno_e_cafe', 'fashion_esporte', 'artigos_de_natal', 'fashion_roupa_infanto_juvenil', 'dvds_blu_ray', 'artes_e_artesanato', 'pc_gamer', 'livros_importados', 'moveis_quarto', 'cine_foto', 'fraldas_higiene', 'flores', 'casa_conforto_2', 'portateis_cozinha_e_preparadores_de_alimentos', 'seguros_e_servicos', 'moveis_colchao_e_estofado', 'cds_dvds_musicais']
#Merge data
total_orders=pd.merge(order_data, order_item_data)
product_orders=pd.merge(total_orders,products_data, on="product_id")
product_orders.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 112650 entries, 0 to 112649 Data columns (total 28 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112650 non-null object 1 customer_id 112650 non-null object 2 order_status 112650 non-null object 3 order_purchase_timestamp 112650 non-null object 4 order_approved_at 112635 non-null object 5 order_delivered_carrier_date 111456 non-null object 6 order_delivered_customer_date 112650 non-null object 7 order_estimated_delivery_date 112650 non-null object 8 delivered_time 112650 non-null object 9 estimate_time 112650 non-null object 10 weekly 112650 non-null int64 11 monthly 112650 non-null object 12 diff_days 112650 non-null int64 13 order_item_id 112650 non-null int64 14 product_id 112650 non-null object 15 seller_id 112650 non-null object 16 shipping_limit_date 112650 non-null object 17 price 112650 non-null float64 18 freight_value 112650 non-null float64 19 product_category_name 111023 non-null object 20 product_category_name_port 111047 non-null object 21 product_name_lenght 111047 non-null float64 22 product_description_lenght 111047 non-null float64 23 product_photos_qty 111047 non-null float64 24 product_weight_g 112632 non-null float64 25 product_length_cm 112632 non-null float64 26 product_height_cm 112632 non-null float64 27 product_width_cm 112632 non-null float64 dtypes: float64(9), int64(3), object(16) memory usage: 24.9+ MB
product_id 값 이름 줄이기
len(product_orders['product_id'].unique())
32951
len(product_orders['product_id'].str[-8:].unique())
32951
product_orders['product_id_shorten']=product_orders['product_id'].str[-8:]
#Plotting Top 10 Products
plt.figure(figsize=(20,10))
sns.countplot(x='product_id_shorten', data=product_orders, palette='gist_earth',
order=product_orders['product_id_shorten'].value_counts()[:10]\
.sort_values().index).set_title("Top 10 Products", fontsize=15,
weight='bold')
Text(0.5, 1.0, 'Top 10 Products')
Top 10 Product의 카테고리
group_category= product_orders.groupby(['product_id_shorten','product_category_name',])['product_id_shorten']\
.count().sort_values(ascending=False).head(10)
group_category
product_id_shorten product_category_name 314663af furniture_decor 527 339b6058 bed_bath_table 488 70e7f83d garden_tools 484 499d9c6b garden_tools 392 97a372db garden_tools 388 f1519f73 garden_tools 373 c61f2ac4 computers_accessories 343 d6772e08 watches_gifts 323 2e5804a6 health_beauty 281 c1e95ad7 computers_accessories 274 Name: product_id_shorten, dtype: int64
product_orders.groupby(["product_category_name"])["product_id_shorten"].count().sort_values(ascending=False).head(10)
product_category_name bed_bath_table 11115 health_beauty 9670 sports_leisure 8641 furniture_decor 8334 computers_accessories 7827 housewares 6964 watches_gifts 5991 telephony 4545 garden_tools 4347 auto 4235 Name: product_id_shorten, dtype: int64
plt.figure(figsize=(10,6))
sns.set_style("whitegrid")
ax = product_orders.product_category_name.value_counts().sort_values(ascending=False)[0:15].plot(kind='bar', color = 'grey', alpha=0.8)
ax.set_title("Top selling product categories")
ax.set_xlabel("States")
plt.xticks(rotation=90)
ax.set_ylabel("No of Orders")
plt.show()
#First seller dataset will be merged with the product orders data.
seller_products = pd.merge(product_orders, sellers_data, on="seller_id")
seller_products.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 112650 entries, 0 to 112649 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 112650 non-null object 1 customer_id 112650 non-null object 2 order_status 112650 non-null object 3 order_purchase_timestamp 112650 non-null object 4 order_approved_at 112635 non-null object 5 order_delivered_carrier_date 111456 non-null object 6 order_delivered_customer_date 112650 non-null object 7 order_estimated_delivery_date 112650 non-null object 8 delivered_time 112650 non-null object 9 estimate_time 112650 non-null object 10 weekly 112650 non-null int64 11 monthly 112650 non-null object 12 diff_days 112650 non-null int64 13 order_item_id 112650 non-null int64 14 product_id 112650 non-null object 15 seller_id 112650 non-null object 16 shipping_limit_date 112650 non-null object 17 price 112650 non-null float64 18 freight_value 112650 non-null float64 19 product_category_name 111023 non-null object 20 product_category_name_port 111047 non-null object 21 product_name_lenght 111047 non-null float64 22 product_description_lenght 111047 non-null float64 23 product_photos_qty 111047 non-null float64 24 product_weight_g 112632 non-null float64 25 product_length_cm 112632 non-null float64 26 product_height_cm 112632 non-null float64 27 product_width_cm 112632 non-null float64 28 product_id_shorten 112650 non-null object 29 seller_zip_code_prefix 112650 non-null int64 30 seller_city 112650 non-null object 31 seller_state 112650 non-null object dtypes: float64(9), int64(4), object(19) memory usage: 28.4+ MB
seller id 길이 줄이기
len(seller_products['seller_id'].unique())
3095
len(seller_products['seller_id'].str[-6:].unique())
3095
seller_products['seller_id_shorten']=seller_products['seller_id'].str[-6:]
seller_products.groupby(['seller_id_shorten'])['seller_id_shorten'].count().sort_values(ascending=False).head(10)
seller_id_shorten 7e94c0 2033 493884 1987 523100 1931 b7556a 1775 dab84a 1551 80ce60 1499 0e0bfa 1428 b010ab 1364 f13abc 1203 3ad736 1171 Name: seller_id_shorten, dtype: int64
Top 10 Seller
plt.figure(figsize=(20,10))
seller_products['seller_id_shorten'].value_counts()[:10].plot.pie(autopct='%1.1f%%',
shadow=True, startangle=90, cmap='tab20')
plt.title("Top 10 Seller",size=14, weight='bold')
Text(0.5, 1.0, 'Top 10 Seller')
seller_category= seller_products.groupby(['seller_id_shorten', 'product_category_name'])\
['seller_id_shorten'].count().sort_values(ascending=False).head(10)
seller_category
seller_id_shorten product_category_name 523100 garden_tools 1882 7e94c0 watches_gifts 1628 493884 bed_bath_table 1572 0e0bfa furniture_decor 1292 dab84a bed_bath_table 1277 b010ab office_furniture 1233 f13abc telephony 1178 b7556a health_beauty 1091 3ad736 cool_stuff 1069 3b52b2 watches_gifts 1002 Name: seller_id_shorten, dtype: int64
# seller_products.groupby(['seller_id_shorten']).apply(lambda x: (x.groupby('product_category_name')
# .count()
# .sort_values('seller_id_shorten', ascending=False))
# .head(3))
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(20,15))
group_category.plot.barh(ax=ax1, cmap='summer')
seller_category.plot.barh(ax=ax2, cmap='autumn')
ax1.set_title('Top10 Product', fontweight='bold')
ax2.set_title('Top10 Seller', fontweight='bold')
ax1.set_xlabel('Count', fontsize=15)
ax1.set_ylabel('Product Name', fontsize=15)
ax1.xaxis.set_tick_params(labelsize=12)
ax1.yaxis.set_tick_params(labelsize=15)
ax2.set_xlabel('Count', fontsize=15)
ax2.set_ylabel('Product Name', fontsize=15)
ax2.xaxis.set_tick_params(labelsize=12)
ax2.yaxis.set_tick_params(labelsize=15)
# First creating new feature of weeks
product_orders['order_week']= pd.to_datetime(product_orders['order_purchase_timestamp'],
format='%Y-%m-%d').dt.isocalendar().week
plt.figure(figsize=(20,12))
sns.countplot('order_week', data=product_orders[product_orders['product_id_shorten']==
'314663af'], palette='Dark2', linewidth=5)\
.set_title('Weekly Purchased Number of `314663af`'
,fontweight='bold')
Text(0.5, 1.0, 'Weekly Purchased Number of `314663af`')
product_orders['order_month']=pd.to_datetime(product_orders['order_purchase_timestamp'])\
.dt.to_period('M')
product_orders['order_month']= product_orders['order_month'].astype(str)
product_orders = product_orders.sort_values('order_month')
plt.figure(figsize=(20,12))
sns.countplot('order_month', data=product_orders[product_orders['product_id_shorten']==
'314663af'], palette='Dark2', linewidth=5)\
.set_title('Monthly Purchased Number of `314663af`'
,fontweight='bold')
Text(0.5, 1.0, 'Monthly Purchased Number of `314663af`')
#Weekly popular items
items_weekly =product_orders.groupby(['order_week','product_category_name'])\
['product_category_name'].count().sort_values(ascending=False)
#Change to data frame
most_products= items_weekly.reset_index(name='count')
#Find the max value of row
max_selling_products= most_products[most_products['count']
== most_products.groupby(['order_week'])\
['count'].transform(max)]
max_selling_products.head(10)
| order_week | product_category_name | count | |
|---|---|---|---|
| 0 | 47 | bed_bath_table | 438 |
| 2 | 33 | bed_bath_table | 355 |
| 3 | 29 | bed_bath_table | 339 |
| 4 | 32 | health_beauty | 330 |
| 5 | 8 | computers_accessories | 330 |
| 8 | 31 | health_beauty | 320 |
| 9 | 9 | computers_accessories | 314 |
| 10 | 19 | health_beauty | 310 |
| 11 | 18 | health_beauty | 302 |
| 12 | 23 | bed_bath_table | 301 |
max_selling_products['product_category_name'].value_counts()
bed_bath_table 33 health_beauty 10 sports_leisure 3 computers_accessories 3 furniture_decor 2 toys 1 Name: product_category_name, dtype: int64
#Monthly popular items
items_monthly =product_orders.groupby(['order_month','product_category_name'])\
['product_category_name'].count().sort_values(ascending=False)
#Change to data frame
most_products= items_monthly.reset_index(name='count')
#Find the max value of row
max_selling_products= most_products[most_products['count']
== most_products.groupby(['order_month'])\
['count'].transform(max)]
max_selling_products.head(10)
| order_month | product_category_name | count | |
|---|---|---|---|
| 0 | 2018-02 | computers_accessories | 981 |
| 1 | 2017-11 | bed_bath_table | 975 |
| 2 | 2018-06 | health_beauty | 885 |
| 3 | 2018-08 | health_beauty | 849 |
| 4 | 2018-01 | bed_bath_table | 848 |
| 5 | 2018-03 | bed_bath_table | 798 |
| 6 | 2018-07 | health_beauty | 784 |
| 9 | 2018-05 | health_beauty | 770 |
| 13 | 2018-04 | bed_bath_table | 746 |
| 35 | 2017-07 | bed_bath_table | 579 |
max_selling_products['product_category_name'].value_counts()
bed_bath_table 12 health_beauty 5 furniture_decor 4 fashion_bags_accessories 1 kitchen_dining_laundry_garden_furniture 1 computers_accessories 1 Name: product_category_name, dtype: int64
지불 금액(payment value)과 총 주문 금액(total order value)으로 총 이익(Gross Profit)과 이익 마진(Profit Margin)을 계산 가능
payments=pd.merge(seller_products, payment_data, on="order_id")
payments.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 117601 entries, 0 to 117600 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 order_id 117601 non-null object 1 customer_id 117601 non-null object 2 order_status 117601 non-null object 3 order_purchase_timestamp 117601 non-null object 4 order_approved_at 117586 non-null object 5 order_delivered_carrier_date 116356 non-null object 6 order_delivered_customer_date 117601 non-null object 7 order_estimated_delivery_date 117601 non-null object 8 delivered_time 117601 non-null object 9 estimate_time 117601 non-null object 10 weekly 117601 non-null int64 11 monthly 117601 non-null object 12 diff_days 117601 non-null int64 13 order_item_id 117601 non-null int64 14 product_id 117601 non-null object 15 seller_id 117601 non-null object 16 shipping_limit_date 117601 non-null object 17 price 117601 non-null float64 18 freight_value 117601 non-null float64 19 product_category_name 115878 non-null object 20 product_category_name_port 115903 non-null object 21 product_name_lenght 115903 non-null float64 22 product_description_lenght 115903 non-null float64 23 product_photos_qty 115903 non-null float64 24 product_weight_g 117581 non-null float64 25 product_length_cm 117581 non-null float64 26 product_height_cm 117581 non-null float64 27 product_width_cm 117581 non-null float64 28 product_id_shorten 117601 non-null object 29 seller_zip_code_prefix 117601 non-null int64 30 seller_city 117601 non-null object 31 seller_state 117601 non-null object 32 seller_id_shorten 117601 non-null object 33 payment_sequential 117601 non-null int64 34 payment_type 117601 non-null object 35 payment_installments 117601 non-null int64 36 payment_value 117601 non-null float64 dtypes: float64(10), int64(6), object(21) memory usage: 34.1+ MB
#First drop irrelevant columns that to make more handy to data
payments= payments.drop(columns=['product_name_lenght','product_description_lenght',
'product_photos_qty','product_weight_g','product_length_cm',
'product_height_cm','product_width_cm'])
price_details= payments.groupby(['order_id','price','product_category_name',
'monthly','weekly'])[['freight_value','payment_value']].sum().reset_index()
total_order_value = price + freight_value
price_details['total_order_value'] = price_details['price'] + price_details['freight_value']
price_details['gross_profit'] = price_details['payment_value']- price_details['total_order_value']
price_details['profit_margin'] = price_details['gross_profit']/price_details['payment_value']
price_details['profit_margin'] = price_details['profit_margin'].astype('int64')
price_details.sort_values('gross_profit', ascending=False).head(10)
| order_id | price | product_category_name | monthly | weekly | freight_value | payment_value | total_order_value | gross_profit | profit_margin | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1466 | 03caa2c082116e1d31e67e9ae3700499 | 1680.00 | fixed_telephony | 2017-10 | 42 | 224.08 | 109312.64 | 1904.08 | 107408.56 | 0 |
| 66507 | ab14fdcfbe524636d65ee38360e22ce8 | 98.70 | auto | 2017-09 | 35 | 288.80 | 45256.00 | 387.50 | 44868.50 | 0 |
| 10547 | 1b15974a0141d54e36626dca3fdc731a | 100.00 | computers_accessories | 2018-03 | 10 | 202.40 | 44048.00 | 302.40 | 43745.60 | 0 |
| 17258 | 2cc9089445046817a7539d90805e6e5a | 989.10 | agro_industry_and_commerce | 2017-12 | 50 | 146.94 | 36489.24 | 1136.04 | 35353.20 | 0 |
| 90919 | e8fa22c3673b1dd17ea315021b1f0f61 | 284.99 | drinks | 2018-05 | 18 | 168.70 | 30186.00 | 453.69 | 29732.31 | 0 |
| 44981 | 736e1922ae60d0d6a89247b851902527 | 1790.00 | fixed_telephony | 2018-07 | 30 | 114.88 | 29099.52 | 1904.88 | 27194.64 | 0 |
| 25839 | 428a2f660dc84138d969ccd69a0ab6d5 | 65.49 | furniture_decor | 2017-12 | 50 | 243.30 | 18384.75 | 308.79 | 18075.96 | 0 |
| 22588 | 3a213fcdfe7d98be74ea0dc05a8b31ae | 108.00 | watches_gifts | 2018-01 | 4 | 186.24 | 17786.88 | 294.24 | 17492.64 | 0 |
| 96725 | f80549a97eb203e1566e026ab66f045b | 137.90 | computers_accessories | 2017-09 | 39 | 388.10 | 17671.00 | 526.00 | 17145.00 | 0 |
| 26442 | 4412d97cb2093633afa85f11db46316c | 120.00 | computers_accessories | 2018-06 | 23 | 83.99 | 15978.65 | 203.99 | 15774.66 | 0 |
price_details = price_details.sort_values('monthly')
plt.figure(figsize=(25,15))
sns.lineplot(x='monthly',y='gross_profit',
data=price_details[price_details['product_category_name']\
=='bed_bath_table'], label='bed_bath_table',color="green")
sns.lineplot(x='monthly', y='gross_profit',
data=price_details[price_details['product_category_name']\
=='health_beauty'], label='health_beauty', color="blue")
sns.lineplot(x='monthly', y='gross_profit',
data=price_details[price_details['product_category_name']\
=='sports_leisure'], label='sports_leisure', color="red")
sns.lineplot(x='monthly', y='gross_profit',
data=price_details[price_details['product_category_name']\
=='furniture_decor'], label='home_decoration', color="orange")
sns.lineplot(x='monthly', y='gross_profit',
data=price_details[price_details['product_category_name']\
=='computers_accessories'], label='Informatic_accessories', color="purple")
plt.title("Gross Profit of Top 5 Products (2016-2018)",fontweight='bold')
Text(0.5, 1.0, 'Gross Profit of Top 5 Products (2016-2018)')
seller_products.groupby(['order_id'])['order_id'].count().sort_values(ascending=False).head(10)
order_id 8272b63d03f5f79c56e9e4120aec44ef 21 1b15974a0141d54e36626dca3fdc731a 20 ab14fdcfbe524636d65ee38360e22ce8 20 9ef13efd6949e4573a18964dd1bbe7f5 15 428a2f660dc84138d969ccd69a0ab6d5 15 9bdc4d4c71aa1de4606060929dee888c 14 73c8ab38f07dc94389065f7eba4f297a 14 37ee401157a3a0b28c9c6d0ed8c3b24b 13 2c2a19b5703863c908512d135aa6accc 12 c05d6a79e55da72ca780ce90364abed9 12 Name: order_id, dtype: int64
seller_products[seller_products['order_id'] == '8272b63d03f5f79c56e9e4120aec44ef']
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | delivered_time | estimate_time | weekly | monthly | diff_days | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_id_shorten | seller_zip_code_prefix | seller_city | seller_state | seller_id_shorten | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 103540 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 1 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103541 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 12 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103542 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 13 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103543 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 14 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103544 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 15 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103545 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 16 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103546 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 17 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103547 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 18 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103548 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 19 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103549 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 20 | 270516a3f41dc035aa87d220228f844c | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 232.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 228f844c | 3813 | sao paulo | SP | 5a56e9 |
| 103550 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 2 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103551 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 3 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103552 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 4 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103553 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 5 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103554 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 6 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103555 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 7 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103556 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 8 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103557 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 9 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103558 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 10 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103559 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 11 | 05b515fdc76e888aada3c6d66c201dff | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 1.2 | 7.89 | health_beauty | beleza_saude | 45.0 | 231.0 | 3.0 | 800.0 | 21.0 | 4.0 | 15.0 | 6c201dff | 3813 | sao paulo | SP | 5a56e9 |
| 103560 | 8272b63d03f5f79c56e9e4120aec44ef | fc3d1daec319d62d49bfb5e1f83123e9 | delivered | 2017-07-16 18:19:25 | 2017-07-17 18:25:23 | 2017-07-20 15:45:53 | 2017-07-31 18:03:02 | 2017-07-28 00:00:00 | 2017-07-31 | 2017-07-28 | 31 | 2017-07 | 3 | 21 | 79ce45dbc2ea29b22b5a261bbb7b7ee7 | 2709af9587499e95e803a6498a5a56e9 | 2017-07-21 18:25:23 | 7.8 | 6.57 | health_beauty | beleza_saude | 27.0 | 152.0 | 2.0 | 1000.0 | 25.0 | 6.0 | 12.0 | bb7b7ee7 | 3813 | sao paulo | SP | 5a56e9 |
#reading datas
import time
start = time.time()
data = pd.read_csv("data/olist_customers_dataset.csv")
geo_data = pd.read_csv("data/olist_geolocation_dataset.csv")
order_itemdata = pd.read_csv("data/olist_order_items_dataset.csv")
pay_data = pd.read_csv("data/olist_order_payments_dataset.csv")
rev_data = pd.read_csv("data/olist_order_reviews_dataset.csv")
orders = pd.read_csv("data/olist_orders_dataset.csv")
order_prddata = pd.read_csv("data/olist_products_dataset.csv")
order_selldata = pd.read_csv("data/olist_sellers_dataset.csv")
end = time.time()
print("reading time: ",(end-start),"sec")
reading time: 3.7213330268859863 sec
#checking number of columns , column_names and no_of_rows
datasets = [data,geo_data,order_itemdata, pay_data, orders, order_prddata,rev_data,order_selldata]
titles = ["customers","geolocations","items", "payments", "orders", "products","reviews","sellers"]
info_df = pd.DataFrame({},)
info_df['dataset']= titles
info_df['no_of_columns']= [len(df.columns) for df in datasets ]
info_df['columns_name']= [', '.join(list(df.columns)) for df in datasets]
info_df['no_of_rows'] = [len(df) for df in datasets]
info_df.style.background_gradient(cmap='Greys')
| dataset | no_of_columns | columns_name | no_of_rows | |
|---|---|---|---|---|
| 0 | customers | 5 | customer_id, customer_unique_id, customer_zip_code_prefix, customer_city, customer_state | 99441 |
| 1 | geolocations | 5 | geolocation_zip_code_prefix, geolocation_lat, geolocation_lng, geolocation_city, geolocation_state | 1000163 |
| 2 | items | 7 | order_id, order_item_id, product_id, seller_id, shipping_limit_date, price, freight_value | 112650 |
| 3 | payments | 5 | order_id, payment_sequential, payment_type, payment_installments, payment_value | 103886 |
| 4 | orders | 8 | order_id, customer_id, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date | 99441 |
| 5 | products | 10 | product_id, product_category_name, product_category_name_port, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm | 32951 |
| 6 | reviews | 7 | review_id, order_id, review_score, review_comment_title, review_comment_message, review_creation_date, review_answer_timestamp | 99224 |
| 7 | sellers | 4 | seller_id, seller_zip_code_prefix, seller_city, seller_state | 3095 |
#checking dtypes
datasets = [data,geo_data,order_itemdata, pay_data, orders, order_prddata,rev_data,order_selldata]
titles = ["customers","geolocations","items", "payments", "orders", "products","reviews","sellers"]
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
new_df = pd.DataFrame({},)
new_df['dataset']= titles
new_df['numeric_features'] = [len((df.select_dtypes(include=numerics)).columns) for df in datasets]
new_df['num_features_name'] = [', '.join(list((df.select_dtypes(include=numerics)).columns)) for df in datasets]
new_df['object_features'] = [len((df.select_dtypes(include='object')).columns) for df in datasets]
new_df['objt_features_name'] = [', '.join(list((df.select_dtypes(include='object')).columns)) for df in datasets]
new_df['bool_features'] = [len((df.select_dtypes(include='bool')).columns) for df in datasets]
new_df.style.background_gradient(cmap='Greys')
| dataset | numeric_features | num_features_name | object_features | objt_features_name | bool_features | |
|---|---|---|---|---|---|---|
| 0 | customers | 1 | customer_zip_code_prefix | 4 | customer_id, customer_unique_id, customer_city, customer_state | 0 |
| 1 | geolocations | 3 | geolocation_zip_code_prefix, geolocation_lat, geolocation_lng | 2 | geolocation_city, geolocation_state | 0 |
| 2 | items | 3 | order_item_id, price, freight_value | 4 | order_id, product_id, seller_id, shipping_limit_date | 0 |
| 3 | payments | 3 | payment_sequential, payment_installments, payment_value | 2 | order_id, payment_type | 0 |
| 4 | orders | 0 | 8 | order_id, customer_id, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date | 0 | |
| 5 | products | 7 | product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm | 3 | product_id, product_category_name, product_category_name_port | 0 |
| 6 | reviews | 1 | review_score | 6 | review_id, order_id, review_comment_title, review_comment_message, review_creation_date, review_answer_timestamp | 0 |
| 7 | sellers | 1 | seller_zip_code_prefix | 3 | seller_id, seller_city, seller_state | 0 |
#checking no of null values
datasets = [data,geo_data,order_itemdata, pay_data, orders, order_prddata,rev_data,order_selldata]
titles = ["customers","geolocations","items", "payments", "orders", "products","reviews","sellers"]
info_df_n = pd.DataFrame({},)
info_df_n['dataset']= titles
#creating column of name of columns in the dataset
info_df_n['cols'] = [', '.join([col for col, null in df.isnull().sum().items() ]) for df in datasets]
#creating total number of columns in the dataset
info_df_n['cols_no']= [df.shape[1] for df in datasets]
#counting total null values
info_df_n['null_no']= [df.isnull().sum().sum() for df in datasets]
#creating total number of columns in the dataset with null-values
info_df_n['null_cols_no']= [len([col for col, null in df.isnull().sum().items() if null > 0]) for df in datasets]
#creating column of name of columns in the dataset with null-values
info_df_n['null_cols'] = [', '.join([col for col, null in df.isnull().sum().items() if null > 0]) for df in datasets]
info_df_n.style.background_gradient(cmap='Greys')
| dataset | cols | cols_no | null_no | null_cols_no | null_cols | |
|---|---|---|---|---|---|---|
| 0 | customers | customer_id, customer_unique_id, customer_zip_code_prefix, customer_city, customer_state | 5 | 0 | 0 | |
| 1 | geolocations | geolocation_zip_code_prefix, geolocation_lat, geolocation_lng, geolocation_city, geolocation_state | 5 | 0 | 0 | |
| 2 | items | order_id, order_item_id, product_id, seller_id, shipping_limit_date, price, freight_value | 7 | 0 | 0 | |
| 3 | payments | order_id, payment_sequential, payment_type, payment_installments, payment_value | 5 | 0 | 0 | |
| 4 | orders | order_id, customer_id, order_status, order_purchase_timestamp, order_approved_at, order_delivered_carrier_date, order_delivered_customer_date, order_estimated_delivery_date | 8 | 4908 | 3 | order_approved_at, order_delivered_carrier_date, order_delivered_customer_date |
| 5 | products | product_id, product_category_name, product_category_name_port, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm | 10 | 3071 | 9 | product_category_name, product_category_name_port, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm |
| 6 | reviews | review_id, order_id, review_score, review_comment_title, review_comment_message, review_creation_date, review_answer_timestamp | 7 | 145903 | 2 | review_comment_title, review_comment_message |
| 7 | sellers | seller_id, seller_zip_code_prefix, seller_city, seller_state | 4 | 0 | 0 |
rev_new = rev_data.drop(['review_comment_title','review_creation_date','review_id','review_answer_timestamp'],axis=1)
df = pd.merge(orders,pay_data, on="order_id")
df = df.merge(data, on="customer_id")
df = df.merge(order_itemdata, on="order_id")
df = df.merge(order_prddata, on="product_id")
df = df.merge(rev_new, on="order_id")
df.head(2)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | payment_sequential | payment_type | payment_installments | payment_value | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | review_score | review_comment_message | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | 1 | credit_card | 1 | 18.12 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 4 | Não testei o produto ainda, mas ele veio corre... |
| 1 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | 3 | voucher | 1 | 2.00 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 4 | Não testei o produto ainda, mas ele veio corre... |
print("Number of rows after merging:",len(df))
print("Number of columns after merging:",len(df.columns))
Number of rows after merging: 117329 Number of columns after merging: 33
df.isnull().sum()
order_id 0 customer_id 0 order_status 0 order_purchase_timestamp 0 order_approved_at 15 order_delivered_carrier_date 1235 order_delivered_customer_date 2471 order_estimated_delivery_date 0 payment_sequential 0 payment_type 0 payment_installments 0 payment_value 0 customer_unique_id 0 customer_zip_code_prefix 0 customer_city 0 customer_state 0 order_item_id 0 product_id 0 seller_id 0 shipping_limit_date 0 price 0 freight_value 0 product_category_name 1720 product_category_name_port 1695 product_name_lenght 1695 product_description_lenght 1695 product_photos_qty 1695 product_weight_g 20 product_length_cm 20 product_height_cm 20 product_width_cm 20 review_score 0 review_comment_message 67650 dtype: int64
#Handling missing values
index = (df[df['order_delivered_customer_date'].isnull() == True].index.values)
df["order_approved_at"].fillna(df["order_purchase_timestamp"], inplace=True)
df["order_delivered_customer_date"].fillna(df["order_estimated_delivery_date"], inplace=True)
#dropping order delivery carrier date
df.drop(labels='order_delivered_carrier_date',axis=1,inplace=True)
#checking the replaced values
df.order_estimated_delivery_date[index[0]]
'2018-03-14 00:00:00'
# Handling missing values of numerical features
df['product_weight_g'].fillna(df['product_weight_g'].median(),inplace=True)
df['product_length_cm'].fillna(df['product_length_cm'].median(),inplace=True)
df['product_height_cm'].fillna(df['product_height_cm'].median(),inplace=True)
df['product_width_cm'].fillna(df['product_width_cm'].median(),inplace=True)
#Handling missing values of text column
print("Percentage of null reviews :",(df.review_comment_message.isnull().sum()/len(df))*100 ,"%")
# filling null value of review comments with no_review
df['review_comment_message'].fillna('no_review',inplace=True)
Percentage of null reviews : 57.65837942878572 %
df.shape
(117329, 32)
dup_rows = df[df.duplicated(['order_id','product_id','customer_id','order_purchase_timestamp','order_delivered_customer_date','customer_unique_id','review_comment_message'])]
dup_rows
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_customer_date | order_estimated_delivery_date | payment_sequential | payment_type | payment_installments | payment_value | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | review_score | review_comment_message | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | 3 | voucher | 1 | 2.00 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 4 | Não testei o produto ainda, mas ele veio corre... |
| 2 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | 2 | voucher | 1 | 18.59 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 4 | Não testei o produto ainda, mas ele veio corre... |
| 31 | d543201a9b42a1402ff97e65b439a48b | 971bf8f42a9f8cb3ead257854905b454 | delivered | 2018-08-21 10:00:25 | 2018-08-21 10:50:54 | 2018-08-28 18:58:22 | 2018-09-10 00:00:00 | 1 | credit_card | 2 | 279.30 | 958ac7760e5484df1496ba5833ab6117 | 89117 | gaspar | SC | 2 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-08-23 10:50:54 | 116.90 | 22.75 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 5 | entrega no prazo, amei |
| 33 | 78037c5f4d9f1f7bab0207f22c4e92f1 | ec8f2dcb080511771e52af7ca42117f5 | delivered | 2018-01-02 17:56:52 | 2018-01-02 18:10:14 | 2018-01-16 17:12:37 | 2018-01-31 00:00:00 | 1 | credit_card | 2 | 261.46 | 60a1e88691ef46c62b52fb2b994d7675 | 25850 | paraiba do sul | RJ | 2 | 595fac2a385ac33a80bd5114aec74eb8 | ef0ace09169ac090589d85746e3e036f | 2018-01-08 18:10:14 | 117.90 | 12.83 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 2 | no_review |
| 41 | 9883744c6400cfad8a620ddb82b9793c | df7f0c302322f4a233a07fb9cbddb46c | delivered | 2018-05-02 10:19:23 | 2018-05-02 10:34:27 | 2018-05-08 21:06:05 | 2018-05-24 00:00:00 | 1 | credit_card | 2 | 271.24 | 7dd163b8e3d4e09daa9e16091dd6dd26 | 4002 | sao paulo | SP | 2 | 595fac2a385ac33a80bd5114aec74eb8 | ef0ace09169ac090589d85746e3e036f | 2018-05-08 10:31:25 | 119.90 | 15.72 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 5 | Fui muito bem atendida |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 117307 | 58edf3f5f282ab91e0f6a6f2b85af02a | 14945bc802d49b0f9f68f2ea8fd060c4 | delivered | 2018-05-07 09:01:58 | 2018-05-08 04:34:10 | 2018-05-14 18:52:56 | 2018-05-30 00:00:00 | 1 | boleto | 1 | 856.16 | 62b8761192c86f4ead06f4a433096a2b | 88370 | navegantes | SC | 4 | f5e585e188891ed071253bacfad60820 | 30829ded4523ab9224b93bc49a62c95f | 2018-05-14 04:32:04 | 194.80 | 19.24 | computers_accessories | informatica_acessorios | 38.0 | 511.0 | 1.0 | 100.0 | 16.0 | 8.0 | 12.0 | 5 | Adquiri 4 cartuchos 41 Canon e recebi somente ... |
| 117312 | 76931aa364051d5d57f6a23aa352956a | b477202dca6a3ad89828fdd5748dc849 | delivered | 2017-11-18 12:51:32 | 2017-11-18 13:06:23 | 2017-11-27 14:55:45 | 2017-12-04 00:00:00 | 1 | credit_card | 1 | 341.00 | dc9ccd8be2d922f8583d97933b06c5f4 | 13087 | campinas | SP | 2 | af16005fca813272caf59c432153949e | 82e0a475a88cc9595229d8029273f045 | 2017-11-24 13:06:23 | 75.90 | 9.35 | furniture_decor | moveis_decoracao | 55.0 | 1186.0 | 2.0 | 800.0 | 53.0 | 8.0 | 20.0 | 1 | COMPREI 04, MAS SO RECEBI 02 |
| 117313 | 76931aa364051d5d57f6a23aa352956a | b477202dca6a3ad89828fdd5748dc849 | delivered | 2017-11-18 12:51:32 | 2017-11-18 13:06:23 | 2017-11-27 14:55:45 | 2017-12-04 00:00:00 | 1 | credit_card | 1 | 341.00 | dc9ccd8be2d922f8583d97933b06c5f4 | 13087 | campinas | SP | 3 | af16005fca813272caf59c432153949e | 82e0a475a88cc9595229d8029273f045 | 2017-11-24 13:06:23 | 75.90 | 9.35 | furniture_decor | moveis_decoracao | 55.0 | 1186.0 | 2.0 | 800.0 | 53.0 | 8.0 | 20.0 | 1 | COMPREI 04, MAS SO RECEBI 02 |
| 117314 | 76931aa364051d5d57f6a23aa352956a | b477202dca6a3ad89828fdd5748dc849 | delivered | 2017-11-18 12:51:32 | 2017-11-18 13:06:23 | 2017-11-27 14:55:45 | 2017-12-04 00:00:00 | 1 | credit_card | 1 | 341.00 | dc9ccd8be2d922f8583d97933b06c5f4 | 13087 | campinas | SP | 4 | af16005fca813272caf59c432153949e | 82e0a475a88cc9595229d8029273f045 | 2017-11-24 13:06:23 | 75.90 | 9.35 | furniture_decor | moveis_decoracao | 55.0 | 1186.0 | 2.0 | 800.0 | 53.0 | 8.0 | 20.0 | 1 | COMPREI 04, MAS SO RECEBI 02 |
| 117325 | e8fd20068b9f7e6ec07068bb7537f781 | 609b9fb8cad4fe0c7b376f77c8ab76ad | delivered | 2017-08-10 21:21:07 | 2017-08-10 21:35:26 | 2017-08-23 15:36:29 | 2017-08-31 00:00:00 | 1 | credit_card | 10 | 748.24 | fb9310710003399b031add3e55f34719 | 3318 | sao paulo | SP | 2 | 0df37da38a30a713453b03053d60d3f7 | 218d46b86c1881d022bce9c68a7d4b15 | 2017-08-17 21:35:26 | 356.00 | 18.12 | sports_leisure | esporte_lazer | 53.0 | 1105.0 | 1.0 | 2800.0 | 28.0 | 23.0 | 23.0 | 4 | no_review |
15451 rows × 32 columns
#Deduplication of entries
df= df.drop_duplicates(subset={'order_id','product_id','customer_id','order_purchase_timestamp','order_delivered_customer_date', 'customer_unique_id','review_comment_message'}, keep='first', inplace=False)
df= df.reindex()
df
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_customer_date | order_estimated_delivery_date | payment_sequential | payment_type | payment_installments | payment_value | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | review_score | review_comment_message | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 | 1 | credit_card | 1 | 18.12 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 4 | Não testei o produto ainda, mas ele veio corre... |
| 3 | 128e10d95713541c87cd1a2e48201934 | a20e8105f23924cd00833fd87daa0831 | delivered | 2017-08-15 18:29:31 | 2017-08-15 20:05:16 | 2017-08-18 14:44:43 | 2017-08-28 00:00:00 | 1 | credit_card | 3 | 37.77 | 3a51803cc0d012c3b5dc8b7528cb05f7 | 3366 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-08-21 20:05:16 | 29.99 | 7.78 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 4 | Deveriam embalar melhor o produto. A caixa vei... |
| 4 | 0e7e841ddf8f8f2de2bad69267ecfbcf | 26c7ac168e1433912a51b924fbd34d34 | delivered | 2017-08-02 18:24:47 | 2017-08-02 18:43:15 | 2017-08-07 18:30:01 | 2017-08-15 00:00:00 | 1 | credit_card | 1 | 37.77 | ef0996a1a279c26e7ecbd737be23d235 | 2290 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-08-08 18:37:31 | 29.99 | 7.78 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 5 | Só achei ela pequena pra seis xícaras ,mais é ... |
| 5 | bfc39df4f36c3693ff3b63fcbea9e90a | 53904ddbea91e1e92b2b3f1d09a7af86 | delivered | 2017-10-23 23:26:46 | 2017-10-25 02:14:11 | 2017-11-07 18:04:59 | 2017-11-13 00:00:00 | 1 | boleto | 1 | 44.09 | e781fdcc107d13d865fc7698711cc572 | 88032 | florianopolis | SC | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-31 02:14:11 | 29.99 | 14.10 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 3 | no_review |
| 6 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 | 1 | boleto | 1 | 141.46 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 1 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 4 | Muito bom o produto. |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 117323 | 87b4c933f31145a28413b39d880ad6c3 | ddfdf5e9b2659e1fbd073404c9b762e0 | delivered | 2018-03-02 17:12:05 | 2018-03-02 17:49:24 | 2018-03-16 17:21:55 | 2018-03-21 00:00:00 | 1 | credit_card | 1 | 85.25 | 490c3c53ca927221bccbb00706afbcf9 | 2042 | sao paulo | SP | 1 | cbaf5898b92064cd5e399c7cff291d0a | 06a2c3af7b3aee5d69171b0e14f0ee87 | 2018-03-08 17:31:06 | 57.99 | 27.26 | health_beauty | beleza_saude | 50.0 | 1031.0 | 1.0 | 350.0 | 19.0 | 12.0 | 13.0 | 5 | Satisfação garantida |
| 117324 | e8fd20068b9f7e6ec07068bb7537f781 | 609b9fb8cad4fe0c7b376f77c8ab76ad | delivered | 2017-08-10 21:21:07 | 2017-08-10 21:35:26 | 2017-08-23 15:36:29 | 2017-08-31 00:00:00 | 1 | credit_card | 10 | 748.24 | fb9310710003399b031add3e55f34719 | 3318 | sao paulo | SP | 1 | 0df37da38a30a713453b03053d60d3f7 | 218d46b86c1881d022bce9c68a7d4b15 | 2017-08-17 21:35:26 | 356.00 | 18.12 | sports_leisure | esporte_lazer | 53.0 | 1105.0 | 1.0 | 2800.0 | 28.0 | 23.0 | 23.0 | 4 | no_review |
| 117326 | cfa78b997e329a5295b4ee6972c02979 | a2f7428f0cafbc8e59f20e1444b67315 | delivered | 2017-12-20 09:52:41 | 2017-12-20 10:09:52 | 2018-01-26 15:45:14 | 2018-01-18 00:00:00 | 1 | credit_card | 1 | 71.04 | a49e8e11e850592fe685ae3c64b40eca | 83870 | campo do tenente | PR | 1 | 3d2c44374ee42b3003a470f3e937a2ea | ce248b21cb2adc36282ede306b7660e5 | 2017-12-27 10:09:52 | 55.90 | 15.14 | musical_instruments | instrumentos_musicais | 34.0 | 372.0 | 2.0 | 300.0 | 16.0 | 6.0 | 12.0 | 1 | Ele não é um mini cajon, é um shaker, ou seja,... |
| 117327 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-17 15:08:01 | 2017-03-28 00:00:00 | 1 | credit_card | 3 | 85.08 | 6359f309b166b0196dbf7ad2ac62bb5a | 12209 | sao jose dos campos | SP | 1 | ac35486adb7b02598c182c2ff2e05254 | e24fc9fcd865784fb25705606fe3dfe7 | 2017-03-15 09:54:05 | 72.00 | 13.08 | health_beauty | beleza_saude | 50.0 | 1517.0 | 1.0 | 1175.0 | 22.0 | 13.0 | 18.0 | 5 | no_review |
| 117328 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-16 13:08:30 | 2018-04-03 00:00:00 | 1 | debit_card | 1 | 86.86 | 60350aa974b26ff12caad89e55993bd6 | 83750 | lapa | PR | 1 | 006619bbed68b000c8ba3f8725d5409e | ececbfcff9804a2d6b40f589df8eef2b | 2018-03-15 10:55:42 | 68.50 | 18.36 | health_beauty | beleza_saude | 45.0 | 569.0 | 1.0 | 150.0 | 16.0 | 7.0 | 15.0 | 5 | no_review |
101878 rows × 32 columns
print("Number of rows after dedublication:",len(df))
print("Number of columns after deduplication:",len(df.columns))
Number of rows after dedublication: 101878 Number of columns after deduplication: 32
# all time stamps are in object dtype as observed above converting it into datetime
df[['order_purchase_timestamp','order_approved_at','order_delivered_customer_date','order_estimated_delivery_date']]=df[['order_purchase_timestamp',
'order_approved_at','order_delivered_customer_date','order_estimated_delivery_date']].apply(pd.to_datetime)
df.isnull().sum()
order_id 0 customer_id 0 order_status 0 order_purchase_timestamp 0 order_approved_at 0 order_delivered_customer_date 0 order_estimated_delivery_date 0 payment_sequential 0 payment_type 0 payment_installments 0 payment_value 0 customer_unique_id 0 customer_zip_code_prefix 0 customer_city 0 customer_state 0 order_item_id 0 product_id 0 seller_id 0 shipping_limit_date 0 price 0 freight_value 0 product_category_name 1476 product_category_name_port 1454 product_name_lenght 1454 product_description_lenght 1454 product_photos_qty 1454 product_weight_g 0 product_length_cm 0 product_height_cm 0 product_width_cm 0 review_score 0 review_comment_message 0 dtype: int64
df.describe()
| payment_sequential | payment_installments | payment_value | customer_zip_code_prefix | order_item_id | price | freight_value | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | review_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 101878.000000 | 101878.000000 | 101878.000000 | 101878.000000 | 101878.000000 | 101878.000000 | 101878.000000 | 100424.000000 | 100424.000000 | 100424.000000 | 101878.000000 | 101878.000000 | 101878.000000 | 101878.000000 | 101878.000000 |
| mean | 1.023185 | 2.959010 | 161.433317 | 35114.638116 | 1.046153 | 124.206951 | 20.105371 | 48.877689 | 786.803543 | 2.239246 | 2088.402962 | 30.077828 | 16.398251 | 23.037103 | 4.075463 |
| std | 0.255513 | 2.746049 | 219.436793 | 29814.808990 | 0.276434 | 187.508494 | 15.871753 | 9.994362 | 650.346334 | 1.743548 | 3749.106887 | 16.086148 | 13.304270 | 11.744047 | 1.354606 |
| min | 1.000000 | 0.000000 | 0.010000 | 1003.000000 | 1.000000 | 0.850000 | 0.000000 | 5.000000 | 4.000000 | 1.000000 | 0.000000 | 7.000000 | 2.000000 | 6.000000 | 1.000000 |
| 25% | 1.000000 | 1.000000 | 61.290000 | 11320.000000 | 1.000000 | 40.000000 | 13.140000 | 43.000000 | 348.000000 | 1.000000 | 300.000000 | 18.000000 | 8.000000 | 15.000000 | 4.000000 |
| 50% | 1.000000 | 2.000000 | 105.810000 | 24360.000000 | 1.000000 | 78.720000 | 16.320000 | 52.000000 | 602.000000 | 2.000000 | 700.000000 | 25.000000 | 13.000000 | 20.000000 | 5.000000 |
| 75% | 1.000000 | 4.000000 | 179.800000 | 58900.000000 | 1.000000 | 139.000000 | 21.200000 | 57.000000 | 985.000000 | 3.000000 | 1800.000000 | 38.000000 | 20.000000 | 30.000000 | 5.000000 |
| max | 27.000000 | 24.000000 | 13664.080000 | 99990.000000 | 21.000000 | 6735.000000 | 409.680000 | 76.000000 | 3992.000000 | 20.000000 | 40425.000000 | 105.000000 | 105.000000 | 118.000000 | 5.000000 |
# checking the target variables i.e review score
df.review_score.value_counts()
5 58583 4 19538 1 11883 3 8502 2 3372 Name: review_score, dtype: int64
def partition(x):
if x < 3:
return 0
elif x == 3:
return 1
else :
return 2
df['review_score']=df['review_score'].map(lambda cw : partition(cw) )
# checking the review score now
df.review_score.value_counts()
2 78121 0 15255 1 8502 Name: review_score, dtype: int64
#counting the review score with 1 and 0
y_value_counts = df.review_score.value_counts()
#calculating the percentage of each review type
print("Total Positive Reviews :", y_value_counts[2], ", (", (y_value_counts[2]/(y_value_counts[1]+y_value_counts[0]))*100,"%)")
print("Total Neutral Reviews :", y_value_counts[1], ", (", (y_value_counts[1]/(y_value_counts[1]+y_value_counts[0]))*100,"%)")
print("Total Negative Reviews :", y_value_counts[0], ", (", (y_value_counts[0]/(y_value_counts[1]+y_value_counts[0]))*100,"%)")
print('\n')
#plotting bar-plot and pie chart
%matplotlib inline
sns.set_style("whitegrid")
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
plt.ylabel('Total Reviews')
plt.xlabel('Label')
plt.title('Negative Vs Neutral Vs Positive Reviews',color='dimgrey')
plt.xticks([10,10.10,10.20],['0','1','2'])
#creating bar plots
plt.bar(10,13621, color = '#2e4884', width = 0.07, alpha=0.7, label='negative', edgecolor='black')
plt.bar(10.10,7978, color = 'gray', width = 0.07, alpha=0.8, label='neutral', edgecolor='black')
plt.bar(10.20,74917, color = 'lightgray', width = 0.07, alpha=0.9, label='positive', edgecolor='black')
plt.legend()
plt.subplot(1,2,2)
# 각 영역의 비율과 이름을 ratio와 labels로 지정
ratio = [74917,7978,13621]
labels = ['Positive','Neutral', 'Negative']
# explode는 부채꼴이 파이 차트의 중심에서 벗어나는 정도를 설정
explode = (0, 0.05, 0.1)
# colors를 사용하면 각 영역의 색상을 자유롭게 지정
color={'#2e4884','lightgray','gray'}
# # wedgeprops는 부채꼴 영역의 스타일을 설정
# wedgeprops={'width': 0.7, 'edgecolor': 'w', 'linewidth': 5}
# autopct는 부채꼴 안에 표시될 숫자의 형식을 지정 -> 소수점 한자리까지 표시하도록 설정
# startangle는 부채꼴이 그려지는 시작 각도를 설정 : 디폴트는 0도 (양의 방향 x축)
# counterclock=False로 설정하면 시계 방향 순서로 부채꼴 영역이 표시
# shadow를 True로 설정하면, 파이 차트에 그림자가 표시
plt.pie(ratio,explode=explode ,colors=color,labels=labels, autopct='%1.1f%%',shadow=False, startangle=0,radius=1.5,labeldistance=1.1,textprops={'fontsize': 14},frame=True, )
plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Pie Chart for review score',color='dimgrey')
plt.show()
Total Positive Reviews : 78121 , ( 328.8336069369028 %) Total Neutral Reviews : 8502 , ( 35.787346887233234 %) Total Negative Reviews : 15255 , ( 64.21265311276676 %)
#Correlation matrix
corr_matrix = df.corr()
plt.figure(figsize=(18,8))
sns.set(font_scale=1.3)
cmap = sns.light_palette("#2f3b39",as_cmap=True)
sns.heatmap(corr_matrix, cmap=cmap,annot=True)
plt.title(" Correlation Matrix of the features",fontsize=19)
plt.savefig('plot16.png', dpi=300, bbox_inches='tight')
plt.show()
#finding corr- values of the features with review_score
corr_matrix["review_score"].sort_values(ascending=False)
review_score 1.000000 product_photos_qty 0.020685 product_description_lenght 0.018756 payment_sequential -0.005381 product_name_lenght -0.006785 price -0.008417 product_width_cm -0.014278 customer_zip_code_prefix -0.015221 product_length_cm -0.016844 product_height_cm -0.018242 product_weight_g -0.026949 freight_value -0.033832 payment_installments -0.047267 payment_value -0.057074 order_item_id -0.102381 Name: review_score, dtype: float64
#checking unique ids
print("Total number of unique seller_id:",len((df.seller_id).unique()))
print("Total number of unique product_id:",len((df.product_id).unique()))
print("Total number of unique customer_id:",len((df.customer_unique_id).unique()))
Total number of unique seller_id: 3090 Total number of unique product_id: 32789 Total number of unique customer_id: 94720
plt.figure(figsize=(8,6))
sns.set_style("whitegrid")
plt.ylabel('No of Unique_Ids')
plt.xlabel('Different Unique_Ids')
plt.title('Total Unique Ids',color='dimgrey')
plt.xticks([10,10.25,10.50],['seller_id','product_id','customer_id'])
#creating bar plots
plt.bar(10,3022, color = 'grey', width = 0.25,alpha=0.7,label='seller_id',edgecolor='black')
plt.bar(10.25,31053, color = 'white', width = 0.25,alpha=0.8,label='product_id',edgecolor='black')
plt.bar(10.50,94087, color = '#2e4884', width = 0.25,alpha=0.9,label='customer_id',edgecolor='black')
plt.legend()
plt.show()
df.groupby('payment_type').size()
payment_type boleto 20190 credit_card 77382 debit_card 1580 voucher 2726 dtype: int64
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
plt.ylabel('Total')
plt.xlabel('payment_type')
plt.title('Total payment_type',color='dimgrey')
plt.xticks([10,10.25,10.50,10.75],['debit_card','voucher','boleto','credit_card'])
#creating bar plots
plt.bar(10.75,73816, color = '#2e4884', width = 0.25,alpha=0.8,label='credit_card',edgecolor='black')
plt.bar(10.50,19345, color = '#d8d8d8', width = 0.25,label='boleto',edgecolor='black')
plt.bar(10.25,2604, color = 'white', width = 0.25,alpha=0.5,label='voucher',edgecolor='black')
plt.bar(10,1490, color = 'grey', width = 0.25,alpha=0.8,label='debit_card',edgecolor='black')
plt.legend()
plt.subplot(1,2,2)
s= [ 73816,19345, 2604,1490]
new = ['credit_card','boleto','voucher','debit_card']
explode = (0, 0, 0,0)
colours = {'credit_card': '#2e4884',
'boleto': '#d8d8d8',
'voucher': 'w',
'debit_card': 'grey'}
color ={'#2e4884','grey','#d8d8d8','w'}
plt.pie(s, explode=explode, labels=new,colors=[colours[key] for key in new] , autopct='%1.1f%%',shadow=False, startangle=70,radius=1.5,frame=True,textprops={'fontsize': 8})
plt.axis('equal')
plt.title('Pie Chart for Total payment_type',color='dimgrey')
plt.show()
Baleto ==> Boleto Bancário, 간단히 Boleto(영어: 티켓)라고 하는 것은 FEBRABAN에서 규제하는 브라질의 지불 방법으로, Brazil Federation of Banks 의 약자. ATM, 지점 시설 및 인터넷 뱅킹에서 지불할 수 있음. 은행, 우체국, 복권 대리점 및 일부 슈퍼마켓은 마감일까지.
대부분의 주문이 신용 카드를 사용하여 지불되고 두 번째로 많이 사용되는 지불 방법은 boleto
신용카드를 75.9%, baleto를 19.9%, 바우처와 체크카드는 4.2%를 사용.
temp = pd.DataFrame(df.groupby('payment_type')['review_score'].agg(lambda x: x.eq(2).sum())).reset_index()
# Pandas dataframe grouby count
temp['total'] = list(pd.DataFrame(df.groupby('payment_type')['review_score'].agg([('total','count'),('Avg','mean')]))['total'])
temp['Avg'] = list(pd.DataFrame(df.groupby('payment_type')['review_score'].agg([('total','count'),('Avg','mean')]))['Avg'])
#sorting dataframe
temp = temp.sort_values(by=['total'], ascending=True)
#Simplifing the plots using pareto plots
def pareto_plot(df, x=None, y=None, title=None, show_pct_y=False, pct_format='{0:.0%}'):
xlabel = x
ylabel = y
tmp = df.sort_values(y, ascending=False)
x = tmp[x].values
y = tmp[y].values
weights = y / y.sum()
cumsum = weights.cumsum()
fig, ax1 = plt.subplots(figsize=(10,6))
ax1.bar(x, y,color='#2e4884',edgecolor='black',alpha=0.9)
ax1.set_xlabel(xlabel)
ax1.set_ylabel(ylabel)
ax2 = ax1.twinx()
ax2.plot(x, cumsum, '-ro', alpha=0.5,color='black')
ax2.set_ylabel('', color='r')
ax2.tick_params('y', colors='r')
vals = ax2.get_yticks()
ax2.set_yticklabels(['{:,.2%}'.format(x) for x in vals])
# hide y-labels on right side
if not show_pct_y:
ax2.set_yticks([])
formatted_weights = [pct_format.format(x) for x in cumsum]
for i, txt in enumerate(formatted_weights):
ax2.annotate(txt, (x[i], cumsum[i]),fontsize=15)
if title:
plt.title(title,color='dimgrey',fontsize=15)
plt.tight_layout()
plt.show()
pareto_plot(temp,x='payment_type',y='total',title="Pareto Plot of counts of each payment type")
#Let us see how this categorical feature related with our target variable
plt.figure(figsize=(12,8))
p1=plt.barh(temp.payment_type,temp.total,color='grey',alpha=0.5)
p2=plt.barh(temp.payment_type,temp.review_score,color='#2e4884',alpha=0.9)
plt.title('Payment Types and user_counts',fontsize=15,color='dimgrey')
plt.ylabel('payment_types',fontsize=14)
plt.xlabel('Total',fontsize=14)
plt.legend((p1[0], p2[0]), ('total_reviews', 'positive_review by users'))
plt.show()

# State with the consumers count
plt.figure(figsize=(10,6))
sns.set_style("whitegrid")
ax = df.customer_state.value_counts().sort_values(ascending=False)[0:15].plot(kind='bar', color = 'grey', alpha=0.8)
ax.set_title("Top 15 consumer states of Brazil")
ax.set_xlabel("States")
plt.xticks(rotation=35)
ax.set_ylabel("Number of consumers")
plt.show()
#stacked bar plots matplotlib: https://matplotlib.org/gallery/lines_bars_and_markers/bar_stacked.html
def stack_plot(data, xtick, col2, col3='total'):
ind = np.arange(data.shape[0])
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, data[col3].values,color = 'grey',alpha=0.5)
p2 = plt.bar(ind, data[col2].values,color= '#2e4884',alpha=0.8)
plt.ylabel('Reviews')
plt.title('% of review_score ')
plt.xticks(ind-0.1, list(data[xtick].values), rotation=0)
plt.legend((p1[0], p2[0]), ('total_reviews', 'positive_review'))
plt.show()
# Count number of zeros in dataframe python: https://stackoverflow.com/a/51540521/4084039
temp_1 = pd.DataFrame(df.groupby('customer_state')['review_score'].agg(lambda x: x.eq(2).sum())).reset_index()
# Pandas dataframe grouby count: https://stackoverflow.com/a/19385591/4084039
temp_1['total'] = list(pd.DataFrame(df.groupby('customer_state')['review_score'].agg([('total','count'),('Avg','mean')]))['total'])
temp_1['Avg'] = list(pd.DataFrame(df.groupby('customer_state')['review_score'].agg([('total','count'),('Avg','mean')]))['Avg'])
temp_1= temp_1.rename(columns={'review_score':'positive_review'})
temp_1= temp_1.sort_values(by=['total'], ascending=False)
temp_1
| customer_state | positive_review | total | Avg | |
|---|---|---|---|---|
| 25 | SP | 33939 | 42872 | 1.663720 |
| 18 | RJ | 9256 | 13113 | 1.495005 |
| 10 | MG | 9313 | 11936 | 1.645191 |
| 22 | RS | 4373 | 5629 | 1.636170 |
| 17 | PR | 4094 | 5141 | 1.669325 |
| 23 | SC | 2824 | 3712 | 1.612069 |
| 4 | BA | 2432 | 3444 | 1.515099 |
| 6 | DF | 1687 | 2205 | 1.609977 |
| 8 | GO | 1572 | 2087 | 1.600862 |
| 7 | ES | 1563 | 2072 | 1.598938 |
| 15 | PE | 1253 | 1674 | 1.577061 |
| 5 | CE | 952 | 1361 | 1.496694 |
| 13 | PA | 697 | 992 | 1.502016 |
| 12 | MT | 716 | 942 | 1.605096 |
| 9 | MA | 515 | 765 | 1.443137 |
| 11 | MS | 569 | 734 | 1.628065 |
| 14 | PB | 400 | 540 | 1.570370 |
| 16 | PI | 364 | 499 | 1.543086 |
| 19 | RN | 385 | 495 | 1.630303 |
| 1 | AL | 288 | 419 | 1.446301 |
| 24 | SE | 247 | 351 | 1.492877 |
| 26 | TO | 225 | 289 | 1.643599 |
| 20 | RO | 192 | 257 | 1.599222 |
| 2 | AM | 121 | 150 | 1.653333 |
| 0 | AC | 60 | 82 | 1.560976 |
| 3 | AP | 56 | 70 | 1.714286 |
| 21 | RR | 28 | 47 | 1.340426 |
stack_plot(temp_1,'customer_state',col2='positive_review', col3='total')
# State with the product_category count
plt.figure(figsize=(10,6))
sns.set_style("whitegrid")
ax = df.product_category_name.value_counts().sort_values(ascending=False)[0:15].plot(kind='bar', color = 'grey', alpha=0.8)
ax.set_title("Top selling product categories")
ax.set_xlabel("States")
plt.xticks(rotation=90)
ax.set_ylabel("No of Orders")
plt.show()
df.groupby(["product_category_name"])["product_id"].count().sort_values(ascending=False).head(10)
product_category_name bed_bath_table 10104 health_beauty 8974 sports_leisure 7822 computers_accessories 6875 furniture_decor 6752 housewares 5983 watches_gifts 5753 telephony 4256 auto 3974 toys 3950 Name: product_id, dtype: int64
temp_2 = pd.DataFrame(df.groupby('product_category_name')['review_score'].agg(lambda x: x.eq(2).sum())).reset_index()
# Pandas dataframe grouby count
temp_2['total'] = list(pd.DataFrame(df.groupby('product_category_name')['review_score'].agg([('total','count'),('Avg','mean')]))['total'])
temp_2['Avg'] = list(pd.DataFrame(df.groupby('product_category_name')['review_score'].agg([('total','count'),('Avg','mean')]))['Avg'])
temp_2 = temp_2.sort_values(by=['total'], ascending=True)
temp_2
| product_category_name | review_score | total | Avg | |
|---|---|---|---|---|
| 61 | security_and_services | 1 | 2 | 1.000000 |
| 29 | fashion_childrens_clothes | 7 | 8 | 1.750000 |
| 52 | la_cuisine | 10 | 12 | 1.666667 |
| 11 | cds_dvds_musicals | 11 | 12 | 1.916667 |
| 46 | home_comfort_2 | 16 | 23 | 1.478261 |
| ... | ... | ... | ... | ... |
| 39 | furniture_decor | 4972 | 6752 | 1.561019 |
| 15 | computers_accessories | 5148 | 6875 | 1.579782 |
| 65 | sports_leisure | 6196 | 7822 | 1.658016 |
| 43 | health_beauty | 7114 | 8974 | 1.664364 |
| 7 | bed_bath_table | 7229 | 10104 | 1.532165 |
71 rows × 4 columns
plt.figure(figsize=(22,18))
plt.barh(temp_2.product_category_name,temp_2.total,color='grey',alpha=0.4)
plt.barh(temp_2.product_category_name,temp_2.review_score,color='#2e4884',alpha=0.7)
plt.title('Top Selling Product Categories in Brazilian E-Commerce (2016-2018)',fontsize=22,color='dimgrey')
plt.ylabel('product_category_name',fontsize=14)
plt.xlabel('Total',fontsize=14)
plt.savefig('plot14.png', dpi=480, bbox_inches='tight')
plt.show()
# plotting frequency orders vs the number of consumers
plt.figure(figsize=(14,8))
#counting the consumers and converting it into percentage to visualize the distribution properly
#소비자를 세어 백분율로 환산하여 분포를 제대로 시각화
num_orders=df['customer_unique_id'].value_counts().value_counts()/df.shape[0]*100
num_orders=num_orders.reset_index()
#renaming the columns
num_orders.rename(columns={'index':'number of orders', 'customer_unique_id':'percentage of customers'},inplace=True)
#plotting bar plot
sns.barplot(data=num_orders,x='number of orders',y='percentage of customers',palette='gray')
# plt.yscale('log') #log scale
plt.title('Number of orders per customer',color='dimgrey')
Text(0.5, 1.0, 'Number of orders per customer')
# plotting frequency orders vs the number of consumers
plt.figure(figsize=(14,8))
#counting the consumers and converting it into percentage to visualize the distribution properly
#소비자를 세어 백분율로 환산하여 분포를 제대로 시각화
num_orders=df['customer_unique_id'].value_counts().value_counts()/df.shape[0]*100
num_orders=num_orders.reset_index()
#renaming the columns
num_orders.rename(columns={'index':'number of orders', 'customer_unique_id':'log percentage of customers'},inplace=True)
#plotting bar plot
sns.barplot(data=num_orders,x='number of orders',y='log percentage of customers',palette='gray')
plt.yscale('log') #log scale
plt.title('Number of orders per customer',color='dimgrey')
Text(0.5, 1.0, 'Number of orders per customer')
# matplotlib 축 형식 지정
def format_spines(ax, right_border=True):
"""
This function sets up borders from an axis and personalize colors
Input:
Axis and a flag for deciding or not to plot the right border
Returns:
Plot configuration
"""
# Setting up colors
ax.spines['bottom'].set_color('#CCCCCC')
ax.spines['left'].set_color('#CCCCCC')
ax.spines['top'].set_visible(False)
if right_border:
ax.spines['right'].set_color('#CCCCCC')
else:
ax.spines['right'].set_color('#FFFFFF')
ax.patch.set_facecolor('#FFFFFF')
def single_countplot(df, ax, x=None, y=None, top=None, order=True, hue=False, palette='plasma',
width=0.75, sub_width=0.3, sub_size=12):
"""
Parâmetros
----------
classifiers: conjunto de classificadores em forma de dicionário [dict]
X: array com os dados a serem utilizados no treinamento [np.array]
y: array com o vetor target do modelo [np.array]
Retorno
-------
None
"""
# Verificando plotagem por quebra de alguma variável categórica
ncount = len(df)
if x:
col = x
else:
col = y
# Verificando a plotagem de top categorias
if top is not None:
cat_count = df[col].value_counts()
top_categories = cat_count[:top].index
df = df[df[col].isin(top_categories)]
# Validando demais argumentos e plotando gráfico
if hue != False:
if order:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax, order=df[col].value_counts().index, hue=hue)
else:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax, hue=hue)
else:
if order:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax, order=df[col].value_counts().index)
else:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax)
# Formatando eixos
format_spines(ax, right_border=False)
# Inserindo rótulo de percentual
if x:
for p in ax.patches:
x = p.get_bbox().get_points()[:, 0]
y = p.get_bbox().get_points()[1, 1]
ax.annotate('{}\n{:.1f}%'.format(int(y), 100. * y / ncount), (x.mean(), y), ha='center', va='bottom')
else:
for p in ax.patches:
x = p.get_bbox().get_points()[1, 0]
y = p.get_bbox().get_points()[:, 1]
ax.annotate('{} ({:.1f}%)'.format(int(x), 100. * x / ncount), (x, y.mean()), va='center')
#by using custom countplots in (viz_utils)
fig, ax = plt.subplots(figsize=(14, 6))
single_countplot(df, x='order_status', ax=ax,palette=['grey'])
# sns.barplot(data=df, x='order_status', ax=ax,palette='grey')
plt.title('Order_status',color='dimgrey')
plt.show()
#calulating number of days for the data is taken
print(df.order_approved_at.max() - df.order_approved_at.min(), ' from ',
df.order_approved_at.min(), ' to ', df.order_approved_at.max())
699 days 07:56:34 from 2016-10-04 09:43:32 to 2018-09-03 17:40:06
df_1=df
# Extracting attributes for purchase date - Year and Month
df_1['order_purchase_year'] = df['order_purchase_timestamp'].apply(lambda x: x.year) #gives year Example :2016-10-04 09:43:32 ---->2016
df_1['order_purchase_month'] = df['order_purchase_timestamp'].apply(lambda x: x.month) #gives month Example :2016-10-04 09:43:32 ---->10
df_1['order_purchase_month_name'] = df['order_purchase_timestamp'].apply(lambda x: x.strftime('%b'))#gives month in short form Example :2016-10-04 09:43:32 ---->10--> Oct
df_1['order_purchase_year_month'] = df['order_purchase_timestamp'].apply(lambda x: x.strftime('%Y%m'))#gives month&year Example :2016-10-04 09:43:32 ---->201610
df_1['order_purchase_date'] = df['order_purchase_timestamp'].apply(lambda x: x.strftime('%Y%m%d'))#gives month,yr and date Example :2016-10-04 09:43:32 ---->20161004
df_1['order_purchase_month_yr'] = df['order_purchase_timestamp'].apply(lambda x: x.strftime("%b-%y"))
# Extracting attributes for purchase date - Day and Day of Week
df_1['order_purchase_day'] = df['order_purchase_timestamp'].apply(lambda x: x.day)
df_1['order_purchase_dayofweek'] = df['order_purchase_timestamp'].apply(lambda x: x.dayofweek)
df_1['order_purchase_dayofweek_name'] = df['order_purchase_timestamp'].apply(lambda x: x.strftime('%a'))
# Extracting attributes for purchase date - Hour and Time of the Day
df_1['order_purchase_hour'] = df['order_purchase_timestamp'].apply(lambda x: x.hour)
hours_bins = [-0.1, 6, 12, 18, 23]
hours_labels = ['Dawn', 'Morning', 'Afternoon', 'Night']
df_1['order_purchase_time_day'] = pd.cut(df_1['order_purchase_hour'], hours_bins, labels=hours_labels)
# New DataFrame after transformations
df_1.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_customer_date | order_estimated_delivery_date | payment_sequential | payment_type | payment_installments | payment_value | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | review_score | review_comment_message | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_month_yr | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-10 21:25:13 | 2017-10-18 | 1 | credit_card | 1 | 18.12 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 2 | Não testei o produto ainda, mas ele veio corre... | 2017 | 10 | Oct | 201710 | 20171002 | Oct-17 | 2 | 0 | Mon | 10 | Morning |
| 3 | 128e10d95713541c87cd1a2e48201934 | a20e8105f23924cd00833fd87daa0831 | delivered | 2017-08-15 18:29:31 | 2017-08-15 20:05:16 | 2017-08-18 14:44:43 | 2017-08-28 | 1 | credit_card | 3 | 37.77 | 3a51803cc0d012c3b5dc8b7528cb05f7 | 3366 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-08-21 20:05:16 | 29.99 | 7.78 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 2 | Deveriam embalar melhor o produto. A caixa vei... | 2017 | 8 | Aug | 201708 | 20170815 | Aug-17 | 15 | 1 | Tue | 18 | Afternoon |
| 4 | 0e7e841ddf8f8f2de2bad69267ecfbcf | 26c7ac168e1433912a51b924fbd34d34 | delivered | 2017-08-02 18:24:47 | 2017-08-02 18:43:15 | 2017-08-07 18:30:01 | 2017-08-15 | 1 | credit_card | 1 | 37.77 | ef0996a1a279c26e7ecbd737be23d235 | 2290 | sao paulo | SP | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-08-08 18:37:31 | 29.99 | 7.78 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 2 | Só achei ela pequena pra seis xícaras ,mais é ... | 2017 | 8 | Aug | 201708 | 20170802 | Aug-17 | 2 | 2 | Wed | 18 | Afternoon |
| 5 | bfc39df4f36c3693ff3b63fcbea9e90a | 53904ddbea91e1e92b2b3f1d09a7af86 | delivered | 2017-10-23 23:26:46 | 2017-10-25 02:14:11 | 2017-11-07 18:04:59 | 2017-11-13 | 1 | boleto | 1 | 44.09 | e781fdcc107d13d865fc7698711cc572 | 88032 | florianopolis | SC | 1 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-31 02:14:11 | 29.99 | 14.10 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 1 | no_review | 2017 | 10 | Oct | 201710 | 20171023 | Oct-17 | 23 | 0 | Mon | 23 | Night |
| 6 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-08-07 15:27:45 | 2018-08-13 | 1 | boleto | 1 | 141.46 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 1 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 2 | Muito bom o produto. | 2018 | 7 | Jul | 201807 | 20180724 | Jul-18 | 24 | 1 | Tue | 20 | Night |
plt.figure(figsize=(15,6))
sns.lineplot(data=df_1['order_purchase_year_month'].value_counts().sort_index(),
color='black', linewidth=2)
plt.title('Evolution of Total Orders in Brazilian E-Commerce', size=14, color='dimgrey')
plt.xticks(rotation=90)
plt.show()
df_month = pd.DataFrame()
df_month['date'],df_month['review_score']= list(df.order_approved_at),list(df.review_score)
df_month=df_month.dropna()
df_month = df_month.sort_values(by=['date'])
df_month['monthcount'] = list(df_month.date.apply(lambda x: x.strftime("%b-%y")))
#plotting number of orders per month-year
plt.figure(figsize=(18,6))
g = sns.countplot(x=df_month.monthcount,data=df_month,color='grey',edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Month-Year')
g.set_ylabel('Orders Count')
plt.title('Number of orders per month-year', size=14, color='dimgrey');
plt.figure(figsize=(25,15))
sns.lineplot(data=df_1[df_1['product_category_name']\
=='health_beauty']['order_purchase_year_month'].value_counts().sort_index(),\
label='health_beauty', color="blue")
plt.title('health&beauty Category Orders in Brazilian E-Commerce',fontweight='bold')
Text(0.5, 1.0, 'health&beauty Category Orders in Brazilian E-Commerce')
plt.figure(figsize=(25,15))
sns.lineplot(data=df_1[df_1['product_category_name']\
=='bed_bath_table']['order_purchase_year_month'].value_counts().sort_index(),\
label='bed_bath_table',color="green")
sns.lineplot(data=df_1[df_1['product_category_name']\
=='health_beauty']['order_purchase_year_month'].value_counts().sort_index(),\
label='health_beauty', color="blue")
sns.lineplot(data=df_1[df_1['product_category_name']\
=='sports_leisure']['order_purchase_year_month'].value_counts().sort_index(),\
label='sports_leisure', color="red")
sns.lineplot(data=df_1[df_1['product_category_name']\
=='furniture_decor']['order_purchase_year_month'].value_counts().sort_index(),\
label='home_decoration', color="orange")
sns.lineplot(data=df_1[df_1['product_category_name']\
=='computers_accessories']['order_purchase_year_month'].value_counts().sort_index(),\
label='Informatic_accessories', color="purple")
plt.title('Top5 Category Orders in Brazilian E-Commerce',fontweight='bold')
Text(0.5, 1.0, 'Top5 Category Orders in Brazilian E-Commerce')
plt.figure(figsize=(18,6))
g = sns.countplot(x=df.product_category_name,hue='review_score',data=df,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Month-Year')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per month-year', size=14, color='dimgrey');
df['seller_id_shorten']=df['seller_id'].str[-6:]
df.groupby(['seller_id_shorten'])['seller_id_shorten'].count().sort_values(ascending=False).head(15)
seller_id_shorten 7e94c0 1966 493884 1875 b7556a 1728 523100 1470 dab84a 1439 80ce60 1281 f13abc 1162 3ad736 1158 3b52b2 1135 733cb1 1114 b010ab 1008 0e0bfa 947 35e066 946 301306 758 959fc6 748 Name: seller_id_shorten, dtype: int64
df_topseller=df[(df['seller_id_shorten']=='7e94c0')|(df['seller_id_shorten']=='493884')|(df['seller_id_shorten']=='b7556a')|(df['seller_id_shorten']=='523100')|(df['seller_id_shorten']=='dab84a')|(df['seller_id_shorten']=='80ce60')|(df['seller_id_shorten']=='f13abc')|(df['seller_id_shorten']=='3ad736')|(df['seller_id_shorten']=='3b52b2')|(df['seller_id_shorten']=='733cb1')|(df['seller_id_shorten']=='b010ab')|(df['seller_id_shorten']=='0e0bfa')|(df['seller_id_shorten']=='35e066')|(df['seller_id_shorten']=='301306')|(df['seller_id_shorten']=='959fc6')]
plt.figure(figsize=(18,6))
g = sns.countplot(x=df_topseller.seller_id_shorten,hue='review_score',data=df_topseller,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Top15 Seller')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per Top15 Seller', size=14, color='dimgrey');
df.groupby(['seller_id_shorten'])['seller_id_shorten'].count().sort_values(ascending=False)[300:310]
seller_id_shorten 5d1d6a 74 74e7fb 74 7f7cae 74 afcf62 74 c4d17e 73 d92273 73 c9b4a3 72 6c050c 71 f75a61 71 fe398e 71 Name: seller_id_shorten, dtype: int64
df_lowseller=df[(df['seller_id_shorten']=='724e45')|(df['seller_id_shorten']=='74e7fb')|(df['seller_id_shorten']=='7f7cae')|(df['seller_id_shorten']=='ca1979')|(df['seller_id_shorten']=='c4d17e')|(df['seller_id_shorten']=='216cc1')|(df['seller_id_shorten']=='62ca31')|(df['seller_id_shorten']=='6c050c')|(df['seller_id_shorten']=='f75a61')|(df['seller_id_shorten']=='b99ab5')]
plt.figure(figsize=(18,6))
g = sns.countplot(x=df_lowseller.seller_id_shorten,hue='review_score',data=df_lowseller,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Low10 Seller')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per Low10 Seller', size=14, color='dimgrey');
df_seller_=df[(df['seller_id_shorten']=='7e94c0')|(df['seller_id_shorten']=='493884')|(df['seller_id_shorten']=='b7556a')|(df['seller_id_shorten']=='523100')|(df['seller_id_shorten']=='dab84a')|(df['seller_id_shorten']=='80ce60')|(df['seller_id_shorten']=='f13abc')|(df['seller_id_shorten']=='3ad736')|(df['seller_id_shorten']=='3b52b2')|(df['seller_id_shorten']=='733cb1')|(df['seller_id_shorten']=='b010ab')|(df['seller_id_shorten']=='0e0bfa')|(df['seller_id_shorten']=='35e066')|(df['seller_id_shorten']=='301306')|(df['seller_id_shorten']=='959fc6')|(df['seller_id_shorten']=='724e45')|(df['seller_id_shorten']=='74e7fb')|(df['seller_id_shorten']=='7f7cae')|(df['seller_id_shorten']=='ca1979')|(df['seller_id_shorten']=='c4d17e')|(df['seller_id_shorten']=='216cc1')|(df['seller_id_shorten']=='62ca31')|(df['seller_id_shorten']=='6c050c')|(df['seller_id_shorten']=='f75a61')|(df['seller_id_shorten']=='b99ab5')]
plt.figure(figsize=(18,6))
g = sns.countplot(x=df_seller_.seller_id_shorten,hue='review_score',data=df_seller_,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Seller')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per Seller', size=14, color='dimgrey');
plt.figure(figsize=(18,6))
g = sns.countplot(x=df.product_category_name,hue='review_score',data=df,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Category')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per category', size=14, color='dimgrey');
df.groupby(['product_category_name'])['product_category_name'].count().sort_values(ascending=False).head(15)
product_category_name bed_bath_table 10104 health_beauty 8974 sports_leisure 7822 computers_accessories 6875 furniture_decor 6752 housewares 5983 watches_gifts 5753 telephony 4256 auto 3974 toys 3950 cool_stuff 3629 garden_tools 3589 perfumery 3236 baby 2910 electronics 2546 Name: product_category_name, dtype: int64
df_topcategory=df[(df['product_category_name']=='bed_bath_table')|(df['product_category_name']=='health_beauty')|(df['product_category_name']=='sports_leisure')|(df['product_category_name']=='computers_accessories')|(df['product_category_name']=='furniture_decor')|(df['product_category_name']=='housewares')|(df['product_category_name']=='watches_gifts')|(df['product_category_name']=='telephony')|(df['product_category_name']=='auto')|(df['product_category_name']=='toys')|(df['product_category_name']=='cool_stuff')|(df['product_category_name']=='garden_tools')|(df['product_category_name']=='perfumery')|(df['product_category_name']=='baby')|(df['product_category_name']=='electronics')]
plt.figure(figsize=(18,6))
g = sns.countplot(x=df_topcategory.product_category_name,hue='review_score',data=df_topcategory,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Top15 Category')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per top15 category', size=14, color='dimgrey');
#plotting number of positive and negative reviews per month-year
plt.figure(figsize=(18,6))
g = sns.countplot(x=df_month.monthcount,hue='review_score',data=df_month,palette=['lightgray','gray','#425a90'],edgecolor='grey')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Month-Year')
g.set_ylabel('Orders Count')
plt.title('Number of positive and negative reviews per month-year', size=14, color='dimgrey');
fig = plt.figure(constrained_layout=True, figsize=(17, 20))
# Axis definition
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[1, 0])
ax2 = fig.add_subplot(gs[0, :])
ax3 = fig.add_subplot(gs[1, 1])
# Barchart - Total Reviews by time of the day
single_countplot(df, x='order_purchase_time_day', ax=ax1, order=False, palette=['lightgray','gray','#2e4884'],hue='review_score')
ax1.set_title('Total Reviews by Time of the Day', size=14, color='dimgrey', pad=20)
# Barchart - Total Reviews by month
single_countplot(df, x='order_purchase_month_name', ax=ax2, order=False, palette=['lightgray','gray','#2e4884'],hue='review_score')
ax2.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug','Sep','Oct','Nov','Dec'])
ax2.set_title('Total Reviews by Month', size=14, color='dimgrey', pad=20)
single_countplot(df, x='order_purchase_dayofweek', ax=ax3, order=False, palette=['lightgray','gray','#2e4884'],hue='review_score')
weekday_label = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax3.set_xticklabels(weekday_label)
ax3.set_title('Total Reviews by Day of Week', size=14, color='dimgrey', pad=20)
plt.savefig('plot14.png', dpi=300, bbox_inches='tight')
plt.tight_layout()
plt.show()
월별 총 리뷰 서브플롯 : 2016년에서 2018년 사이의 총 리뷰 중 긍정적인 리뷰의 가장 높은 9.0%가 2월에 제공.
5월과 7월에는 총 리뷰 중 8.5% 이상의 리뷰가 긍정적
시간별 총 리뷰 서브플롯 : 오후에 최대 주문 수를 받고 긍정적인 리뷰의 가장 높은 비율(29.8%)도 해당 시간에 제공
요일별 총 리뷰 서브플롯 : 월요일에 최대 주문 수를 받고, 긍정적인 리뷰의 가장 높은 %가 월요일과 화요일
df['day_to_delivery']=((df['order_delivered_customer_date']-df['order_purchase_timestamp']).dt.days)
df_dev = pd.DataFrame()
df_dev['day_to_delivery'],df_dev['review_score']= list(df.day_to_delivery),list(df.review_score)
df_dev=df_dev.dropna()
plt.figure(figsize=(24,6))
plt.title('Order Counts Based on Total delivery Time(in Days)', color='dimgrey')
g = sns.countplot(x=df_dev.day_to_delivery,data=df_dev,color='gray')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Total Days')
g.set_ylabel('Orders Count');
plt.figure(figsize=(24,6))
plt.title('Order Counts Based on Total delivery Time(in Days)', color='dimgrey')
g = sns.countplot(x=df.day_to_delivery,data=df[df['product_category_name']\
=='bed_bath_table']['day_to_delivery'].value_counts().sort_index(),color='gray')
g.set_xticklabels(g.get_xticklabels(), rotation=90)
g.set_xlabel('Total Days')
g.set_ylabel('Orders Count');
df[df['product_category_name']\
=='bed_bath_table']['day_to_delivery'].value_counts().sort_index()
0 2
1 105
2 234
3 310
4 407
...
104 2
106 1
136 1
141 1
148 1
Name: day_to_delivery, Length: 80, dtype: int64
import seaborn as sns
plt.figure(figsize=(26,8))
sns.set_style("whitegrid")
ax = sns.FacetGrid(df, hue="review_score", height=5,aspect=2.0,palette=['#2e4884','gray','black'])
ax = ax.map(sns.distplot, "price").add_legend();
plt.title('Distribution of product price per class')
plt.show()
<Figure size 1872x576 with 0 Axes>
# plotting distributions of freight_value per class
plt.figure()
#sns.set_style("whitegrid")
ax = sns.FacetGrid(df, hue="review_score", height=5,aspect=2.0,palette=['#000080','gray','black'])
ax = ax.map(sns.distplot, "freight_value").add_legend();
plt.title('Distribution of freight_value per class')
plt.show()
<Figure size 432x288 with 0 Axes>
# plotting distributions of product_height_cm per class
sns.set_style("whitegrid")
ax = sns.FacetGrid(df, hue="review_score", height=5,aspect=2.0,palette=['#2e4884','gray','black'])
ax = ax.map(sns.distplot, "product_height_cm").add_legend();
plt.title('Distribution of product_height_cm per class')
plt.show()
# distriution plot of product_weight_g
plt.figure()
sns.set_style("whitegrid")
ax = sns.FacetGrid(df, hue="review_score", height=5,aspect=2.0,palette=['#2e4884','gray','black'])
ax = ax.map(sns.distplot, "product_weight_g").add_legend();
plt.title('Distribution of product_weight_g per class')
plt.show()
<Figure size 432x288 with 0 Axes>
# distriution plot of payment_value
plt.figure()
sns.set_style("whitegrid")
ax = sns.FacetGrid(df, hue="review_score", height=5,aspect=2.0,palette=['#2e4884','gray','black'])
ax = ax.map(sns.distplot, "payment_value").add_legend();
plt.title('Distribution of payment_value per class')
plt.show()
<Figure size 432x288 with 0 Axes>
import matplotlib.pyplot as plt
plt.figure(figsize=(14,6))
box_plot_data=[df.product_length_cm,df.product_height_cm,df.product_width_cm]
plt.boxplot(box_plot_data,labels=['product_length_cm','product_height_cm','product_width_cm'],vert=False)
plt.title("Box Plots of Product Dimensions")
plt.savefig('plot24.png', dpi=400, bbox_inches='tight')
plt.show()
import matplotlib.pyplot as plt
plt.figure(figsize=(20,6))
box_plot_data=[df.payment_value,df.price]
plt.boxplot(box_plot_data,labels=['payment_value','price'],vert=False)
plt.title("Box Plots of Different Prices")
plt.savefig('plot25.png', dpi=400, bbox_inches='tight')
plt.show()
# Distribution of price vs freight_value per class
plt.figure(figsize=(8,5))
sns.set_style("whitegrid")
ax = sns.scatterplot(x='price',y='freight_value', data = df, hue="review_score",palette=['#2e4884','gray','lightgray'])
plt.title('Distribution of price vs freight_value per class')
plt.show()
# Distribution of price vs freight_value per class
plt.figure(figsize=(8,5))
sns.set_style("whitegrid")
ax = sns.scatterplot(x='price',y='product_weight_g', data = df, hue="review_score",palette=['#2e4884','grey','lightgray'])
plt.title('Distribution of price vs product_weight_g per class')
plt.show()
# pair plot
sns.set(style="ticks", color_codes=True)
g = sns.pairplot(df[['product_photos_qty','product_name_lenght','product_description_lenght','review_score']],hue='review_score',palette=['#2e4884','gray','lightgray'])
g.savefig("pairplot1.png")
df_mm=df[['order_purchase_month_name','price']].groupby('order_purchase_month_name').sum()
pi = list(df_mm['price'])
li = list(df_mm.index)
#dict of months and price value
res = {li[i]: pi[i] for i in range(len(li))}
from collections import OrderedDict
mnths = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug','Sep','Oct','Nov','Dec']
weeks=['Sun','Mon','Tue','Wed','Thu','Fri','Sat']
res = dict(OrderedDict(sorted(res.items(),key =lambda x:mnths.index(x[0]))))#sorting by month
print(res)
{'Jan': 995263.7500000386, 'Feb': 1017297.8200000336, 'Mar': 1261957.9599999802, 'Apr': 1270770.219999985, 'May': 1395299.9899999497, 'Jun': 1222437.9599999932, 'Jul': 1290536.3299999775, 'Aug': 1347190.1799999669, 'Sep': 569043.8800000116, 'Oct': 656618.8800000142, 'Nov': 928936.5100000325, 'Dec': 698602.2300000172}
temp_3= pd.DataFrame(df.groupby('order_purchase_month_name')['review_score'].agg(lambda x: x.eq(2).sum())).reset_index()
# Pandas dataframe grouby count
temp_3['total'] = list(pd.DataFrame(df.groupby('order_purchase_month_name')['review_score'].agg([('total','count'),('Avg','mean')]))['total'])
temp_3['Avg'] = list(pd.DataFrame(df.groupby('order_purchase_month_name')['review_score'].agg([('total','count'),('Avg','mean')]))['Avg'])
temp_3= temp_3.sort_values(by=['total'], ascending=True)
rem = {list(temp_3.order_purchase_month_name)[i]: list(temp_3.total)[i] for i in range(len(temp_3))}
rem = dict(OrderedDict(sorted(rem.items(),key =lambda x:mnths.index(x[0]))))
print(rem)
{'Jan': 8286, 'Feb': 8647, 'Mar': 10094, 'Apr': 9629, 'May': 10859, 'Jun': 9700, 'Jul': 10567, 'Aug': 11151, 'Sep': 4370, 'Oct': 5066, 'Nov': 7735, 'Dec': 5774}
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style("whitegrid")
fig, ax1 = plt.subplots()
color = 'grey'
ax1.set_xlabel('Month')
ax1.set_ylabel('price', color=color)
ax1.plot(list(res.keys()),list(res.values()), color=color)
ax1.plot(list(res.keys()),list(res.values()),'C0o', alpha=0.5,color='grey')
ax1.tick_params(axis='y', labelcolor=color)
ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis
color = '#2e4884'
ax2.set_ylabel('orders', color=color) # we already handled the x-label with ax1
ax2.plot(list(res.keys()),list(rem.values()), color=color)
ax2.plot(list(res.keys()),list(rem.values()),'C0o', alpha=0.5,color='#2e4884')
ax2.tick_params(axis='y', labelcolor=color)
#creating points
fig.tight_layout( ) # otherwise the right y-label is slightly clipped
plt.show()
# Reading in the reviews dataset
review_df = pd.read_csv('data/olist_order_reviews_dataset.csv')
review_df.head()
review_df.shape
(99224, 7)
review_data_title = review_df['review_comment_title']
review_data = review_df.drop(['review_comment_title'],axis=1)
# Dropping NaN values
review_data = review_data.dropna()
review_data_title = review_data_title.dropna()
# Resetting the reviews index and visualizing the data
review_data = review_data.reset_index(drop=True)
review_data.head(3)
review_data.shape
(40977, 6)
# Resetting the reviews titles index and visualizing the data
review_data_title = review_data_title.reset_index(drop=True)
review_data_title.head(3)
review_data_title.shape
(11568,)
import nltk
# nltk.download()
# nltk.download('stopwords')
# nltk.download('punkt')
comments = []
stop_words = set(stopwords.words('portuguese'))
for words in review_data['review_comment_message']:
only_letters = re.sub("[^a-zA-Z]", " ",words)
tokens = nltk.word_tokenize(only_letters) #tokenize the sentences
lower_case = [l.lower() for l in tokens] #convert all letters to lower case
filtered_result = list(filter(lambda l: l not in stop_words, lower_case)) #Remove stopwords from the comments
comments.append(' '.join(filtered_result))
#Using wordcloud to visualize the comments
unique_string=(" ").join(comments)
wordcloud = WordCloud(width = 2000, height = 1000,background_color='white').generate(unique_string)
plt.figure(figsize=(20,12))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('plot23.png', dpi=400, bbox_inches='tight')
plt.show()
#further checking by Counting the words
from collections import Counter
words = (" ".join(review_data['review_comment_message'])).lower().split()
counts = Counter(words)
print("Most frequent words:")
sorted(counts.items(), key=lambda x: x[1], reverse=True)[:15]
Most frequent words:
[('o', 18676),
('e', 15582),
('produto', 15462),
('a', 12133),
('de', 11296),
('do', 11139),
('não', 10614),
('que', 8271),
('muito', 7568),
('com', 5574),
('antes', 5533),
('entrega', 5363),
('chegou', 5130),
('prazo', 5043),
('no', 5038)]
print("Least frequent words:")
sorted(counts.items(), key=lambda x: x[1], reverse=False)[:15]
Least frequent words:
[('3desinfector', 1),
('nome...atualizar', 1),
(',travando...pelo', 1),
('decpcionou', 1),
('amim', 1),
('dispensada', 1),
('atrasaram,', 1),
('apaixonada,', 1),
('tomara', 1),
('pelinho.', 1),
('midea', 1),
('split', 1),
('estilo.', 1),
('parceria:', 1),
('rápidíssima,', 1)]
# Getting the number of words by splitting them by a space
words_per_review = df.review_comment_message.apply(lambda x: len(x.split(" ")))
plt.figure(figsize=(10,6))
words_per_review.hist(bins = 100)
plt.xlabel('Review Length (words)')
plt.ylabel('Frequency')
plt.show()
review_messages에 대한 단어 구름이 위에 표시
메시지는 포르투갈어로 되어 있으며 가장 자주 사용되는 단어는 antes prazo, produto entregue, produto chegou(마감 전, 제품 배송, 제품 도착) 등
가장 자주 사용되는 단어는 'o'which의 의미 The이고, 다른 빈번한 단어는 각각 e,produto,awhich의 의미 and, product,the
# Reading all the files
olist_customer = pd.read_csv('data/olist_customers_dataset.csv')
olist_geolocation = pd.read_csv('data/olist_geolocation_dataset.csv')
olist_orders = pd.read_csv('data/olist_orders_dataset.csv')
olist_order_items = pd.read_csv('data/olist_order_items_dataset.csv')
olist_order_payments = pd.read_csv('data/olist_order_payments_dataset.csv')
olist_order_reviews = pd.read_csv('data/olist_order_reviews_dataset.csv')
olist_products = pd.read_csv('data/olist_products_dataset.csv')
olist_sellers = pd.read_csv('data/olist_sellers_dataset.csv')
olist_category = pd.read_csv('data/product_category_name_translation.csv')
# Collections for each dataset
datasets = [olist_customer, olist_geolocation, olist_orders, olist_order_items, olist_order_payments,
olist_order_reviews, olist_products, olist_sellers, olist_category]
names = ['olist_customer', 'olist_geolocation', 'olist_orders', 'olist_order_items', 'olist_order_payments',
'olist_order_reviews', 'olist_products', 'olist_sellers', 'olist_category']
# Creating a DataFrame with useful information about all datasets
data_info = pd.DataFrame({})
data_info['dataset'] = names
data_info['n_rows'] = [df.shape[0] for df in datasets]
data_info['n_cols'] = [df.shape[1] for df in datasets]
data_info['null_amount'] = [df.isnull().sum().sum() for df in datasets]
data_info['qty_null_columns'] = [len([col for col, null in df.isnull().sum().items() if null > 0]) for df in datasets]
data_info['null_columns'] = [', '.join([col for col, null in df.isnull().sum().items() if null > 0]) for df in datasets]
data_info.style.background_gradient()
| dataset | n_rows | n_cols | null_amount | qty_null_columns | null_columns | |
|---|---|---|---|---|---|---|
| 0 | olist_customer | 99441 | 5 | 0 | 0 | |
| 1 | olist_geolocation | 1000163 | 5 | 0 | 0 | |
| 2 | olist_orders | 99441 | 8 | 4908 | 3 | order_approved_at, order_delivered_carrier_date, order_delivered_customer_date |
| 3 | olist_order_items | 112650 | 7 | 0 | 0 | |
| 4 | olist_order_payments | 103886 | 5 | 0 | 0 | |
| 5 | olist_order_reviews | 99224 | 7 | 145903 | 2 | review_comment_title, review_comment_message |
| 6 | olist_products | 32951 | 10 | 3071 | 9 | product_category_name, product_category_name_port, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm |
| 7 | olist_sellers | 3095 | 4 | 0 | 0 | |
| 8 | olist_category | 71 | 2 | 0 | 0 |
# Formatting matplotlib axes
def format_spines(ax, right_border=True):
"""
This function sets up borders from an axis and personalize colors
Input:
Axis and a flag for deciding or not to plot the right border
Returns:
Plot configuration
"""
# Setting up colors
ax.spines['bottom'].set_color('#CCCCCC')
ax.spines['left'].set_color('#CCCCCC')
ax.spines['top'].set_visible(False)
if right_border:
ax.spines['right'].set_color('#CCCCCC')
else:
ax.spines['right'].set_color('#FFFFFF')
ax.patch.set_facecolor('#FFFFFF')
def single_countplot(df, ax, x=None, y=None, top=None, order=True, hue=False, palette='plasma',
width=0.75, sub_width=0.3, sub_size=12):
"""
Parameters
----------
classifiers: dictionary-shaped set of classifiers [dict]
X: array with the data to be used in training [np.array]
y: array with the target vector of the model [np.array]
Return
-------
None
"""
# 범주형 변수를 깨는 플로팅 확인
ncount = len(df)
if x:
col = x
else:
col = y
# 상위 카테고리 플롯 확인
if top is not None:
cat_count = df[col].value_counts()
top_categories = cat_count[:top].index
df = df[df[col].isin(top_categories)]
# 많은 인수 검증 및 그래프 그리기
if hue != False:
if order:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax, order=df[col].value_counts().index, hue=hue)
else:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax, hue=hue)
else:
if order:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax, order=df[col].value_counts().index)
else:
sns.countplot(x=x, y=y, data=df, palette=palette, ax=ax)
# 축 서식 지정
format_spines(ax, right_border=False)
# 백분율 레이블 삽입
if x:
for p in ax.patches:
x = p.get_bbox().get_points()[:, 0]
y = p.get_bbox().get_points()[1, 1]
ax.annotate('{}\n{:.1f}%'.format(int(y), 100. * y / ncount), (x.mean(), y), ha='center', va='bottom')
else:
for p in ax.patches:
x = p.get_bbox().get_points()[1, 0]
y = p.get_bbox().get_points()[:, 1]
ax.annotate('{} ({:.1f}%)'.format(int(x), 100. * x / ncount), (x, y.mean()), va='center')
df_orders = olist_orders.merge(olist_customer, how='left', on='customer_id')
fig, ax = plt.subplots(figsize=(14, 6))
single_countplot(df_orders, x='order_status', ax=ax)
plt.show()
# Changing the data type for date columns
timestamp_cols = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
'order_estimated_delivery_date']
for col in timestamp_cols:
df_orders[col] = pd.to_datetime(df_orders[col])
# Extracting attributes for purchase date - Year and Month
df_orders['order_purchase_year'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.year)
df_orders['order_purchase_month'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.month)
df_orders['order_purchase_month_name'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.strftime('%b'))
df_orders['order_purchase_year_month'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.strftime('%Y%m'))
df_orders['order_purchase_date'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.strftime('%Y%m%d'))
# Extracting attributes for purchase date - Day and Day of Week
df_orders['order_purchase_day'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.day)
df_orders['order_purchase_dayofweek'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.dayofweek)
df_orders['order_purchase_dayofweek_name'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.strftime('%a'))
# Extracting attributes for purchase date - Hour and Time of the Day
df_orders['order_purchase_hour'] = df_orders['order_purchase_timestamp'].apply(lambda x: x.hour)
hours_bins = [-0.1, 6, 12, 18, 23]
hours_labels = ['Dawn', 'Morning', 'Afternoon', 'Night']
df_orders['order_purchase_time_day'] = pd.cut(df_orders['order_purchase_hour'], hours_bins, labels=hours_labels)
# New DataFrame after transformations
df_orders.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017 | 10 | Oct | 201710 | 20171002 | 2 | 0 | Mon | 10 | Morning |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018 | 7 | Jul | 201807 | 20180724 | 24 | 1 | Tue | 20 | Night |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018 | 8 | Aug | 201808 | 20180808 | 8 | 2 | Wed | 8 | Morning |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017 | 11 | Nov | 201711 | 20171118 | 18 | 5 | Sat | 19 | Night |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018 | 2 | Feb | 201802 | 20180213 | 13 | 1 | Tue | 21 | Night |
fig = plt.figure(constrained_layout=True, figsize=(13, 10))
# Axis definition
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1])
# Lineplot - Evolution of e-commerce orders along time
sns.lineplot(data=df_orders['order_purchase_year_month'].value_counts().sort_index(), ax=ax1,
color='darkslateblue', linewidth=2)
ax1.annotate(f'Highest orders \nreceived', (13, 7500), xytext=(-75, -25),
textcoords='offset points', bbox=dict(boxstyle="round4", fc="w", pad=.8),
arrowprops=dict(arrowstyle='-|>', fc='w'), color='dimgrey', ha='center')
ax1.annotate(f'Noise on data \n(huge decrease)', (23, 0), xytext=(48, 25),
textcoords='offset points', bbox=dict(boxstyle="round4", fc="w", pad=.5),
arrowprops=dict(arrowstyle='-|>', fc='w'), color='dimgrey', ha='center')
format_spines(ax1, right_border=False)
for tick in ax1.get_xticklabels():
tick.set_rotation(45)
ax1.set_title('Evolution of Total Orders in Brazilian E-Commerce', size=14, color='dimgrey')
# Barchart - Total of orders by day of week
single_countplot(df_orders, x='order_purchase_dayofweek', ax=ax2, order=False, palette='YlGnBu')
weekday_label = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax2.set_xticklabels(weekday_label)
ax2.set_title('Total Orders by Day of Week', size=14, color='dimgrey', pad=20)
# Barchart - Total of orders by time of the day
day_color_list = ['darkslateblue', 'deepskyblue', 'darkorange', 'purple']
single_countplot(df_orders, x='order_purchase_time_day', ax=ax3, order=False, palette=day_color_list)
ax3.set_title('Total Orders by Time of the Day', size=14, color='dimgrey', pad=20)
plt.tight_layout()
plt.show()
# Creating figure
fig = plt.figure(constrained_layout=True, figsize=(13, 5))
# Axis definition
gs = GridSpec(1, 3, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1:])
# Annotation - Grown on e-commerce orders between 2017 and 2018
df_orders_compare = df_orders.query('order_purchase_year in (2017, 2018) & order_purchase_month <= 8')
year_orders = df_orders_compare['order_purchase_year'].value_counts()
growth = int(round(100 * (1 + year_orders[2017] / year_orders[2018]), 0))
ax1.text(0.00, 0.73, f'{year_orders[2017]}', fontsize=40, color='mediumseagreen', ha='center')
ax1.text(0.00, 0.64, 'orders registered in 2017\nbetween January and August', fontsize=10, ha='center')
ax1.text(0.00, 0.40, f'{year_orders[2018]}', fontsize=60, color='darkslateblue', ha='center')
ax1.text(0.00, 0.31, 'orders registered in 2018\nbetween January and August', fontsize=10, ha='center')
signal = '+' if growth > 0 else '-'
ax1.text(0.00, 0.20, f'{signal}{growth}%', fontsize=14, ha='center', color='white', style='italic', weight='bold',
bbox=dict(facecolor='navy', alpha=0.5, pad=10, boxstyle='round, pad=.7'))
ax1.axis('off')
# Bar chart - Comparison between monthly sales between 2017 and 2018
single_countplot(df_orders_compare, x='order_purchase_month', hue='order_purchase_year', ax=ax2, order=False,
palette='YlGnBu')
month_label = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug']
ax2.set_xticklabels(month_label)
ax2.set_title('Total Orders Comparison Between 2017 and 2018 (January to August)', size=12, color='dimgrey', pad=20)
plt.legend(loc='lower right')
plt.show()
# Merging orders and order_items
df_orders_items = df_orders.merge(olist_order_items, how='left', on='order_id')
# Using the API to bring the region to the data
r = requests.get('https://servicodados.ibge.gov.br/api/v1/localidades/mesorregioes')
content = [c['UF'] for c in json.loads(r.text)]
br_info = pd.DataFrame(content)
br_info['nome_regiao'] = br_info['regiao'].apply(lambda x: x['nome'])
br_info.drop('regiao', axis=1, inplace=True)
br_info.drop_duplicates(inplace=True)
# Threting geolocations outside brazilian map
#Brazils most Northern spot is at 5 deg 16′ 27.8″ N latitude.;
geo_prep = olist_geolocation[olist_geolocation.geolocation_lat <= 5.27438888]
#it’s most Western spot is at 73 deg, 58′ 58.19″W Long.
geo_prep = geo_prep[geo_prep.geolocation_lng >= -73.98283055]
#It’s most southern spot is at 33 deg, 45′ 04.21″ S Latitude.
geo_prep = geo_prep[geo_prep.geolocation_lat >= -33.75116944]
#It’s most Eastern spot is 34 deg, 47′ 35.33″ W Long.
geo_prep = geo_prep[geo_prep.geolocation_lng <= -34.79314722]
geo_group = geo_prep.groupby(by='geolocation_zip_code_prefix', as_index=False).min()
# Merging all the informations
df_orders_items = df_orders_items.merge(br_info, how='left', left_on='customer_state', right_on='sigla')
df_orders_items = df_orders_items.merge(geo_group, how='left', left_on='customer_zip_code_prefix',
right_on='geolocation_zip_code_prefix')
df_orders_items.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | id | sigla | nome | nome_regiao | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017 | 10 | Oct | 201710 | 20171002 | 2 | 0 | Mon | 10 | Morning | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | 35 | SP | São Paulo | Sudeste | 3149.0 | -23.583885 | -46.587473 | sao paulo | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018 | 7 | Jul | 201807 | 20180724 | 24 | 1 | Tue | 20 | Night | 1.0 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | 29 | BA | Bahia | Nordeste | 47813.0 | -12.945892 | -45.016313 | barreiras | BA |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018 | 8 | Aug | 201808 | 20180808 | 8 | 2 | Wed | 8 | Morning | 1.0 | aa4383b373c6aca5d8797843e5594415 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 2018-08-13 08:55:23 | 159.90 | 19.22 | 52 | GO | Goiás | Centro-Oeste | 75265.0 | -16.752735 | -48.522139 | vianopolis | GO |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017 | 11 | Nov | 201711 | 20171118 | 18 | 5 | Sat | 19 | Night | 1.0 | d0b61bfb1de832b15ba9d266ca96e5b0 | 66922902710d126a0e7d26b0e3805106 | 2017-11-23 19:45:59 | 45.00 | 27.20 | 24 | RN | Rio Grande do Norte | Nordeste | 59296.0 | -5.778488 | -35.281450 | sao goncalo do amarante | RN |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018 | 2 | Feb | 201802 | 20180213 | 13 | 1 | Tue | 21 | Night | 1.0 | 65266b2da20d04dbe00c5c2d3bb7859e | 2c9e548be18521d1c43cde1c582c6de8 | 2018-02-19 20:31:37 | 19.90 | 8.72 | 35 | SP | São Paulo | Sudeste | 9195.0 | -23.760018 | -46.524784 | santo andre | SP |
# Filtering data between 201701 and 201808
df_orders_filt = df_orders_items[(df_orders_items['order_purchase_year_month'].astype(int) >= 201701)]
df_orders_filt = df_orders_filt[(df_orders_filt['order_purchase_year_month'].astype(int) <= 201808)]
# Grouping data by region
df_regions_group = df_orders_filt.groupby(by=['order_purchase_year_month', 'nome_regiao'], as_index=False)
df_regions_group = df_regions_group.agg({'customer_id': 'count', 'price': 'sum'}).sort_values(by='order_purchase_year_month')
df_regions_group.columns = ['month', 'region', 'order_count', 'order_amount']
df_regions_group.reset_index(drop=True, inplace=True)
# Grouping data by city (top 10)
df_cities_group = df_orders_filt.groupby(by='geolocation_city',
as_index=False).count().loc[:, ['geolocation_city', 'order_id']]
df_cities_group = df_cities_group.sort_values(by='order_id', ascending=False).reset_index(drop=True)
df_cities_group = df_cities_group.iloc[:10, :]
import matplotlib
from warnings import filterwarnings
filterwarnings('ignore')
from typing import *
from dataclasses import dataclass
from math import ceil
Patch = matplotlib.patches.Patch
PosVal = Tuple[float, Tuple[float, float]]
Axis = matplotlib.axes.Axes
PosValFunc = Callable[[Patch], PosVal]
class AnnotateBars:
font_size: int = 10
color: str = "black"
n_dec: int = 2
def horizontal(self, ax: Axis, centered=False):
def get_vals(p: Patch) -> PosVal:
value = p.get_width()
div = 2 if centered else 1
pos = (
p.get_x() + p.get_width() / div,
p.get_y() + p.get_height() / 2,
)
return value, pos
ha = "center" if centered else "left"
self._annotate(ax, get_vals, ha=ha, va="center")
def vertical(self, ax: Axis, centered:bool=False):
def get_vals(p: Patch) -> PosVal:
value = p.get_height()
div = 2 if centered else 1
pos = (p.get_x() + p.get_width() / 2,
p.get_y() + p.get_height() / div
)
return value, pos
va = "center" if centered else "bottom"
self._annotate(ax, get_vals, ha="center", va=va)
def _annotate(self, ax, func: PosValFunc, **kwargs):
cfg = {"color": self.color,
"fontsize": self.font_size, **kwargs}
for p in ax.patches:
value, pos = func(p)
ax.annotate(f"{value:.{self.n_dec}f}", pos, **cfg)
# Creating and preparing figure and axis
fig = plt.figure(constrained_layout=True, figsize=(15, 10))
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[:, 1])
# Count of orders by region
sns.lineplot(x='month', y='order_count', ax=ax1, data=df_regions_group, hue='region',
size='region', style='region', palette='magma', markers=['o'] * 5)
format_spines(ax1, right_border=False)
ax1.set_title('Evolution of E-Commerce Orders on Brazilian Regions', size=12, color='dimgrey')
ax1.set_ylabel('')
for tick in ax1.get_xticklabels():
tick.set_rotation(45)
# Top cities with more customers orders in Brazil
sns.barplot(y='geolocation_city', x='order_id', data=df_cities_group, ax=ax2, palette='magma')
AnnotateBars(n_dec=0, font_size=10, color='black').horizontal(ax2)
format_spines(ax2, right_border=False)
ax2.set_title('Top 10 Brazilian Cities with More Orders', size=12, color='dimgrey')
ax2.set_ylabel('')
# Total orders by state
single_countplot(y='customer_state', ax=ax3, df=df_orders_filt, palette='viridis')
ax3.set_title('Total of Customers Orders by State', size=12, color='dimgrey')
ax3.set_ylabel('')
plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-185-489705615c6e> in <module> 17 # Top cities with more customers orders in Brazil 18 sns.barplot(y='geolocation_city', x='order_id', data=df_cities_group, ax=ax2, palette='magma') ---> 19 AnnotateBars(n_dec=0, font_size=10, color='black').horizontal(ax2) 20 format_spines(ax2, right_border=False) 21 ax2.set_title('Top 10 Brazilian Cities with More Orders', size=12, color='dimgrey') TypeError: AnnotateBars() takes no arguments
# Zipping locations
lats = list(df_orders_items.query('order_purchase_year == 2018')['geolocation_lat'].dropna().values)[:30000]
longs = list(df_orders_items.query('order_purchase_year == 2018')['geolocation_lng'].dropna().values)[:30000]
locations = list(zip(lats, longs))
# Creating a mapa using folium
map1 = folium.Map(location=[-15, -50], zoom_start=4.0)
# Plugin: FastMarkerCluster
FastMarkerCluster(data=locations).add_to(map1)
map1
# Grouping geolocation data for plotting a heatmap
heat_data = df_orders_filt.groupby(by=['geolocation_lat', 'geolocation_lng'], as_index=False).count().iloc[:, :3]
# Creating a mapa using folium
map1 = folium.Map(
location=[-15, -50],
zoom_start=4.0,
tiles='cartodbdark_matter'
)
# Plugin: HeatMap
HeatMap(
name='Mapa de Calor',
data=heat_data,
radius=10,
max_zoom=13
).add_to(map1)
map1
epoch_list = []
heatmap_evl_data = df_orders_items[(df_orders_items['order_purchase_year_month'].astype(int) >= 201701)]
heatmap_evl_data = heatmap_evl_data[(heatmap_evl_data['order_purchase_year_month'].astype(int) <= 201807)]
time_index = heatmap_evl_data['order_purchase_year_month'].sort_values().unique()
for epoch in time_index:
data_temp = heatmap_evl_data.query('order_purchase_year_month == @epoch')
data_temp = data_temp.groupby(by=['geolocation_lat', 'geolocation_lng'], as_index=False).count()
data_temp = data_temp.sort_values(by='order_id', ascending=False).iloc[:, :3]
epoch_list.append(data_temp.values.tolist())
# Creating a mapa using folium
map2 = folium.Map(
location=[-15, -50],
zoom_start=4.0,
tiles='cartodbdark_matter'
)
# Plugin: HeatMapWithTime
HeatMapWithTime(
name='Evolution of Orders in a Geolocation Perspective',
data=epoch_list,
radius=10,
index=list(time_index)
).add_to(map2)
map2
# Zipping locations
lats = list(df_orders_items.query('seller_id == "6560211a19b47992c3666cc44a7e94c0"')['geolocation_lat'].dropna().values)
longs = list(df_orders_items.query('seller_id == "6560211a19b47992c3666cc44a7e94c0"')['geolocation_lng'].dropna().values)
locations = list(zip(lats, longs))
# Creating a mapa using folium
map1 = folium.Map(location=[-15, -50], zoom_start=4.0)
# Plugin: FastMarkerCluster
FastMarkerCluster(data=locations).add_to(map1)
map1
df_orders_items.groupby(['seller_id'])['seller_id'].count().sort_values(ascending=False).head(15)
seller_id 6560211a19b47992c3666cc44a7e94c0 2033 4a3ca9315b744ce9f8e9374361493884 1987 1f50f920176fa81dab994f9023523100 1931 cc419e0650a3c5ba77189a1882b7556a 1775 da8622b14eb17ae2831f4ac5b9dab84a 1551 955fee9216a65b617aa5c0531780ce60 1499 1025f0e2d44d7041d6cf58b6550e0bfa 1428 7c67e1448b00f6e969d365cea6b010ab 1364 ea8482cd71df3c1969d7b9473ff13abc 1203 7a67c85e85bb2ce8582c35f2203ad736 1171 4869f7a5dfa277a7dca6462dcf3b52b2 1156 3d871de0142ce09b7081e2b9d1733cb1 1147 8b321bb669392f5163d04c59e235e066 1018 cca3071e3e9bb7d12640c9fbe2301306 830 620c87c171fb2a6dd6e8bb4dec959fc6 798 Name: seller_id, dtype: int64
df_orders_items
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | id | sigla | nome | nome_regiao | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017 | 10 | Oct | 201710 | 20171002 | 2 | 0 | Mon | 10 | Morning | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | 35 | SP | São Paulo | Sudeste | 3149.0 | -23.583885 | -46.587473 | sao paulo | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018 | 7 | Jul | 201807 | 20180724 | 24 | 1 | Tue | 20 | Night | 1.0 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | 29 | BA | Bahia | Nordeste | 47813.0 | -12.945892 | -45.016313 | barreiras | BA |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018 | 8 | Aug | 201808 | 20180808 | 8 | 2 | Wed | 8 | Morning | 1.0 | aa4383b373c6aca5d8797843e5594415 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 2018-08-13 08:55:23 | 159.90 | 19.22 | 52 | GO | Goiás | Centro-Oeste | 75265.0 | -16.752735 | -48.522139 | vianopolis | GO |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017 | 11 | Nov | 201711 | 20171118 | 18 | 5 | Sat | 19 | Night | 1.0 | d0b61bfb1de832b15ba9d266ca96e5b0 | 66922902710d126a0e7d26b0e3805106 | 2017-11-23 19:45:59 | 45.00 | 27.20 | 24 | RN | Rio Grande do Norte | Nordeste | 59296.0 | -5.778488 | -35.281450 | sao goncalo do amarante | RN |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018 | 2 | Feb | 201802 | 20180213 | 13 | 1 | Tue | 21 | Night | 1.0 | 65266b2da20d04dbe00c5c2d3bb7859e | 2c9e548be18521d1c43cde1c582c6de8 | 2018-02-19 20:31:37 | 19.90 | 8.72 | 35 | SP | São Paulo | Sudeste | 9195.0 | -23.760018 | -46.524784 | santo andre | SP |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113420 | 63943bddc261676b46f01ca7ac2f7bd8 | 1fca14ff2861355f6e5f14306ff977a7 | delivered | 2018-02-06 12:58:58 | 2018-02-06 13:10:37 | 2018-02-07 23:22:42 | 2018-02-28 17:37:56 | 2018-03-02 | da62f9e57a76d978d02ab5362c509660 | 11722 | praia grande | SP | 2018 | 2 | Feb | 201802 | 20180206 | 6 | 1 | Tue | 12 | Morning | 1.0 | f1d4ce8c6dd66c47bbaa8c6781c2a923 | 1f9ab4708f3056ede07124aad39a2554 | 2018-02-12 13:10:37 | 174.90 | 20.10 | 35 | SP | São Paulo | Sudeste | 11722.0 | -24.006742 | -46.457069 | praia grande | SP |
| 113421 | 83c1379a015df1e13d02aae0204711ab | 1aa71eb042121263aafbe80c1b562c9c | delivered | 2017-08-27 14:46:43 | 2017-08-27 15:04:16 | 2017-08-28 20:52:26 | 2017-09-21 11:24:17 | 2017-09-27 | 737520a9aad80b3fbbdad19b66b37b30 | 45920 | nova vicosa | BA | 2017 | 8 | Aug | 201708 | 20170827 | 27 | 6 | Sun | 14 | Afternoon | 1.0 | b80910977a37536adeddd63663f916ad | d50d79cb34e38265a8649c383dcffd48 | 2017-09-05 15:04:16 | 205.99 | 65.02 | 29 | BA | Bahia | Nordeste | 45920.0 | -17.912151 | -39.389831 | nova vicosa | BA |
| 113422 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 | 5097a5312c8b157bb7be58ae360ef43c | 28685 | japuiba | RJ | 2018 | 1 | Jan | 201801 | 20180108 | 8 | 0 | Mon | 21 | Night | 1.0 | d1c427060a0f73f6b889a5c7c61f2ac4 | a1043bafd471dff536d0c462352beb48 | 2018-01-12 21:36:21 | 179.99 | 40.59 | 33 | RJ | Rio de Janeiro | Sudeste | 28685.0 | -22.645479 | -42.775234 | cachoeiras de macacu | RJ |
| 113423 | 11c177c8e97725db2631073c19f07b62 | b331b74b18dc79bcdf6532d51e1637c1 | delivered | 2018-01-08 21:28:27 | 2018-01-08 21:36:21 | 2018-01-12 15:35:03 | 2018-01-25 23:32:54 | 2018-02-15 | 5097a5312c8b157bb7be58ae360ef43c | 28685 | japuiba | RJ | 2018 | 1 | Jan | 201801 | 20180108 | 8 | 0 | Mon | 21 | Night | 2.0 | d1c427060a0f73f6b889a5c7c61f2ac4 | a1043bafd471dff536d0c462352beb48 | 2018-01-12 21:36:21 | 179.99 | 40.59 | 33 | RJ | Rio de Janeiro | Sudeste | 28685.0 | -22.645479 | -42.775234 | cachoeiras de macacu | RJ |
| 113424 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 | 60350aa974b26ff12caad89e55993bd6 | 83750 | lapa | PR | 2018 | 3 | Mar | 201803 | 20180308 | 8 | 3 | Thu | 20 | Night | 1.0 | 006619bbed68b000c8ba3f8725d5409e | ececbfcff9804a2d6b40f589df8eef2b | 2018-03-15 10:55:42 | 68.50 | 18.36 | 41 | PR | Paraná | Sul | 83750.0 | -25.780791 | -49.866078 | lapa | PR |
113425 rows × 37 columns
df_orders_items = df_orders_items.drop(columns=["id","sigla","nome","nome_regiao","geolocation_zip_code_prefix","geolocation_lat","geolocation_lng","geolocation_city","geolocation_state"])
df_orders_items = df_orders_items.merge(olist_products, how='left', on='product_id')
df_orders_items = df_orders_items.merge(olist_sellers, how='left', on='seller_id')
df_orders_items = df_orders_items.merge(br_info, how='left', left_on='seller_state', right_on='sigla')
df_orders_items = df_orders_items.merge(geo_group, how='left', left_on='seller_zip_code_prefix',
right_on='geolocation_zip_code_prefix')
df_orders_items.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | seller_zip_code_prefix | seller_city | seller_state | id | sigla | nome | nome_regiao | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017 | 10 | Oct | 201710 | 20171002 | 2 | 0 | Mon | 10 | Morning | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 9350.0 | maua | SP | 35.0 | SP | São Paulo | Sudeste | 9350.0 | -23.689418 | -46.452454 | maua | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018 | 7 | Jul | 201807 | 20180724 | 24 | 1 | Tue | 20 | Night | 1.0 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 31570.0 | belo horizonte | SP | 35.0 | SP | São Paulo | Sudeste | 31570.0 | -19.813375 | -43.986543 | belo horizonte | MG |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018 | 8 | Aug | 201808 | 20180808 | 8 | 2 | Wed | 8 | Morning | 1.0 | aa4383b373c6aca5d8797843e5594415 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 2018-08-13 08:55:23 | 159.90 | 19.22 | auto | automotivo | 46.0 | 232.0 | 1.0 | 420.0 | 24.0 | 19.0 | 21.0 | 14840.0 | guariba | SP | 35.0 | SP | São Paulo | Sudeste | 14840.0 | -21.396342 | -48.245580 | guariba | SP |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017 | 11 | Nov | 201711 | 20171118 | 18 | 5 | Sat | 19 | Night | 1.0 | d0b61bfb1de832b15ba9d266ca96e5b0 | 66922902710d126a0e7d26b0e3805106 | 2017-11-23 19:45:59 | 45.00 | 27.20 | pet_shop | pet_shop | 59.0 | 468.0 | 3.0 | 450.0 | 30.0 | 10.0 | 20.0 | 31842.0 | belo horizonte | MG | 31.0 | MG | Minas Gerais | Sudeste | 31842.0 | -19.919438 | -44.022020 | belo horizonte | MG |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018 | 2 | Feb | 201802 | 20180213 | 13 | 1 | Tue | 21 | Night | 1.0 | 65266b2da20d04dbe00c5c2d3bb7859e | 2c9e548be18521d1c43cde1c582c6de8 | 2018-02-19 20:31:37 | 19.90 | 8.72 | stationery | papelaria | 38.0 | 316.0 | 4.0 | 250.0 | 51.0 | 15.0 | 15.0 | 8752.0 | mogi das cruzes | SP | 35.0 | SP | São Paulo | Sudeste | 8752.0 | -23.611175 | -46.268467 | mogi das cruzes | SP |
olist_sellers = olist_sellers.merge(br_info, how='left', left_on='seller_state', right_on='sigla')
olist_sellers = olist_sellers.merge(geo_group, how='left', left_on='seller_zip_code_prefix',
right_on='geolocation_zip_code_prefix')
olist_sellers.head()
| seller_id | seller_zip_code_prefix | seller_city | seller_state | id | sigla | nome | nome_regiao | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP | 35 | SP | São Paulo | Sudeste | 13023.0 | -22.900175 | -47.066168 | campinas | SP |
| 1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP | 35 | SP | São Paulo | Sudeste | 13844.0 | -22.395891 | -46.971691 | mogi guacu | SP |
| 2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ | 33 | RJ | Rio de Janeiro | Sudeste | 20031.0 | -22.912577 | -43.519359 | rio de janeiro | RJ |
| 3 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195 | sao paulo | SP | 35 | SP | São Paulo | Sudeste | 4195.0 | -23.659907 | -46.615532 | sao paulo | SP |
| 4 | 51a04a8a6bdcb23deccc82b0b80742cf | 12914 | braganca paulista | SP | 35 | SP | São Paulo | Sudeste | 12914.0 | -22.988744 | -46.540353 | braganca paulista | SP |
1위 셀러 위치
# Zipping locations
lats = list(olist_sellers.query('seller_id == "6560211a19b47992c3666cc44a7e94c0"')['geolocation_lat'].dropna().values)
longs = list(olist_sellers.query('seller_id == "6560211a19b47992c3666cc44a7e94c0"')['geolocation_lng'].dropna().values)
locations = list(zip(lats, longs))
# Creating a mapa using folium
map1 = folium.Map(location=[-15, -50], zoom_start=4.0)
# Plugin: FastMarkerCluster
FastMarkerCluster(data=locations).add_to(map1)
map1
#Deduplication of entries
df_cat_seller= df_orders_items.drop_duplicates(subset={'product_id'}, keep='first', inplace=False)
df_cat_seller= df_cat_seller.reindex()
df_cat_seller
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | seller_zip_code_prefix | seller_city | seller_state | id | sigla | nome | nome_regiao | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017 | 10 | Oct | 201710 | 20171002 | 2 | 0 | Mon | 10 | Morning | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 9350.0 | maua | SP | 35.0 | SP | São Paulo | Sudeste | 9350.0 | -23.689418 | -46.452454 | maua | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018 | 7 | Jul | 201807 | 20180724 | 24 | 1 | Tue | 20 | Night | 1.0 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 31570.0 | belo horizonte | SP | 35.0 | SP | São Paulo | Sudeste | 31570.0 | -19.813375 | -43.986543 | belo horizonte | MG |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018 | 8 | Aug | 201808 | 20180808 | 8 | 2 | Wed | 8 | Morning | 1.0 | aa4383b373c6aca5d8797843e5594415 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 2018-08-13 08:55:23 | 159.90 | 19.22 | auto | automotivo | 46.0 | 232.0 | 1.0 | 420.0 | 24.0 | 19.0 | 21.0 | 14840.0 | guariba | SP | 35.0 | SP | São Paulo | Sudeste | 14840.0 | -21.396342 | -48.245580 | guariba | SP |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017 | 11 | Nov | 201711 | 20171118 | 18 | 5 | Sat | 19 | Night | 1.0 | d0b61bfb1de832b15ba9d266ca96e5b0 | 66922902710d126a0e7d26b0e3805106 | 2017-11-23 19:45:59 | 45.00 | 27.20 | pet_shop | pet_shop | 59.0 | 468.0 | 3.0 | 450.0 | 30.0 | 10.0 | 20.0 | 31842.0 | belo horizonte | MG | 31.0 | MG | Minas Gerais | Sudeste | 31842.0 | -19.919438 | -44.022020 | belo horizonte | MG |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018 | 2 | Feb | 201802 | 20180213 | 13 | 1 | Tue | 21 | Night | 1.0 | 65266b2da20d04dbe00c5c2d3bb7859e | 2c9e548be18521d1c43cde1c582c6de8 | 2018-02-19 20:31:37 | 19.90 | 8.72 | stationery | papelaria | 38.0 | 316.0 | 4.0 | 250.0 | 51.0 | 15.0 | 15.0 | 8752.0 | mogi das cruzes | SP | 35.0 | SP | São Paulo | Sudeste | 8752.0 | -23.611175 | -46.268467 | mogi das cruzes | SP |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113390 | 87b4c933f31145a28413b39d880ad6c3 | ddfdf5e9b2659e1fbd073404c9b762e0 | delivered | 2018-03-02 17:12:05 | 2018-03-02 17:49:24 | 2018-03-06 21:23:32 | 2018-03-16 17:21:55 | 2018-03-21 | 490c3c53ca927221bccbb00706afbcf9 | 2042 | sao paulo | SP | 2018 | 3 | Mar | 201803 | 20180302 | 2 | 4 | Fri | 17 | Afternoon | 1.0 | cbaf5898b92064cd5e399c7cff291d0a | 06a2c3af7b3aee5d69171b0e14f0ee87 | 2018-03-08 17:31:06 | 57.99 | 27.26 | health_beauty | beleza_saude | 50.0 | 1031.0 | 1.0 | 350.0 | 19.0 | 12.0 | 13.0 | 65072.0 | sao luis | MA | 21.0 | MA | Maranhão | Nordeste | 65072.0 | -2.567177 | -44.309276 | sao luis | MA |
| 113395 | e8fd20068b9f7e6ec07068bb7537f781 | 609b9fb8cad4fe0c7b376f77c8ab76ad | delivered | 2017-08-10 21:21:07 | 2017-08-10 21:35:26 | 2017-08-18 16:42:51 | 2017-08-23 15:36:29 | 2017-08-31 | fb9310710003399b031add3e55f34719 | 3318 | sao paulo | SP | 2017 | 8 | Aug | 201708 | 20170810 | 10 | 3 | Thu | 21 | Night | 1.0 | 0df37da38a30a713453b03053d60d3f7 | 218d46b86c1881d022bce9c68a7d4b15 | 2017-08-17 21:35:26 | 356.00 | 18.12 | sports_leisure | esporte_lazer | 53.0 | 1105.0 | 1.0 | 2800.0 | 28.0 | 23.0 | 23.0 | 14070.0 | ribeirao preto | SP | 35.0 | SP | São Paulo | Sudeste | 14070.0 | -21.124146 | -47.795530 | ribeirao preto | SP |
| 113414 | cfa78b997e329a5295b4ee6972c02979 | a2f7428f0cafbc8e59f20e1444b67315 | delivered | 2017-12-20 09:52:41 | 2017-12-20 10:09:52 | 2017-12-20 20:25:25 | 2018-01-26 15:45:14 | 2018-01-18 | a49e8e11e850592fe685ae3c64b40eca | 83870 | campo do tenente | PR | 2017 | 12 | Dec | 201712 | 20171220 | 20 | 2 | Wed | 9 | Morning | 1.0 | 3d2c44374ee42b3003a470f3e937a2ea | ce248b21cb2adc36282ede306b7660e5 | 2017-12-27 10:09:52 | 55.90 | 15.14 | musical_instruments | instrumentos_musicais | 34.0 | 372.0 | 2.0 | 300.0 | 16.0 | 6.0 | 12.0 | 89251.0 | jaragua do sul | SC | 42.0 | SC | Santa Catarina | Sul | 89251.0 | -26.494915 | -49.089816 | jaragua do sul | SC |
| 113419 | 9c5dedf39a927c1b2549525ed64a053c | 39bd1228ee8140590ac3aca26f2dfe00 | delivered | 2017-03-09 09:54:05 | 2017-03-09 09:54:05 | 2017-03-10 11:18:03 | 2017-03-17 15:08:01 | 2017-03-28 | 6359f309b166b0196dbf7ad2ac62bb5a | 12209 | sao jose dos campos | SP | 2017 | 3 | Mar | 201703 | 20170309 | 9 | 3 | Thu | 9 | Morning | 1.0 | ac35486adb7b02598c182c2ff2e05254 | e24fc9fcd865784fb25705606fe3dfe7 | 2017-03-15 09:54:05 | 72.00 | 13.08 | health_beauty | beleza_saude | 50.0 | 1517.0 | 1.0 | 1175.0 | 22.0 | 13.0 | 18.0 | 12913.0 | braganca paulista | SP | 35.0 | SP | São Paulo | Sudeste | 12913.0 | -22.963104 | -46.527803 | braganca paulista | SP |
| 113424 | 66dea50a8b16d9b4dee7af250b4be1a5 | edb027a75a1449115f6b43211ae02a24 | delivered | 2018-03-08 20:57:30 | 2018-03-09 11:20:28 | 2018-03-09 22:11:59 | 2018-03-16 13:08:30 | 2018-04-03 | 60350aa974b26ff12caad89e55993bd6 | 83750 | lapa | PR | 2018 | 3 | Mar | 201803 | 20180308 | 8 | 3 | Thu | 20 | Night | 1.0 | 006619bbed68b000c8ba3f8725d5409e | ececbfcff9804a2d6b40f589df8eef2b | 2018-03-15 10:55:42 | 68.50 | 18.36 | health_beauty | beleza_saude | 45.0 | 569.0 | 1.0 | 150.0 | 16.0 | 7.0 | 15.0 | 14407.0 | franca | SP | 35.0 | SP | São Paulo | Sudeste | 14407.0 | -20.514694 | -47.432886 | franca | SP |
32952 rows × 49 columns
bed_bath_table 카테고리 제품 위치 분포
# Zipping locations
lats = list(df_cat_seller.query('product_category_name == "bed_bath_table"')['geolocation_lat'].dropna().values)
longs = list(df_cat_seller.query('product_category_name == "bed_bath_table"')['geolocation_lng'].dropna().values)
locations = list(zip(lats, longs))
# Creating a mapa using folium
map1 = folium.Map(location=[-15, -50], zoom_start=4.0)
# Plugin: FastMarkerCluster
FastMarkerCluster(data=locations).add_to(map1)
map1
#Deduplication of entries
df_cat_seller= df_cat_seller.drop_duplicates(subset={'product_category_name','seller_id'}, keep='first', inplace=False)
df_cat_seller= df_cat_seller.reindex()
df_cat_seller
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | order_purchase_year | order_purchase_month | order_purchase_month_name | order_purchase_year_month | order_purchase_date | order_purchase_day | order_purchase_dayofweek | order_purchase_dayofweek_name | order_purchase_hour | order_purchase_time_day | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | product_category_name | product_category_name_port | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | seller_zip_code_prefix | seller_city | seller_state | id | sigla | nome | nome_regiao | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | 7c396fd4830fd04220f754e42b4e5bff | 3149 | sao paulo | SP | 2017 | 10 | Oct | 201710 | 20171002 | 2 | 0 | Mon | 10 | Morning | 1.0 | 87285b34884572647811a353c7ac498a | 3504c0cb71d7fa48d967e0e4c94d59d9 | 2017-10-06 11:07:15 | 29.99 | 8.72 | housewares | utilidades_domesticas | 40.0 | 268.0 | 4.0 | 500.0 | 19.0 | 8.0 | 13.0 | 9350.0 | maua | SP | 35.0 | SP | São Paulo | Sudeste | 9350.0 | -23.689418 | -46.452454 | maua | SP |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | af07308b275d755c9edb36a90c618231 | 47813 | barreiras | BA | 2018 | 7 | Jul | 201807 | 20180724 | 24 | 1 | Tue | 20 | Night | 1.0 | 595fac2a385ac33a80bd5114aec74eb8 | 289cdb325fb7e7f891c38608bf9e0962 | 2018-07-30 03:24:27 | 118.70 | 22.76 | perfumery | perfumaria | 29.0 | 178.0 | 1.0 | 400.0 | 19.0 | 13.0 | 19.0 | 31570.0 | belo horizonte | SP | 35.0 | SP | São Paulo | Sudeste | 31570.0 | -19.813375 | -43.986543 | belo horizonte | MG |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | 3a653a41f6f9fc3d2a113cf8398680e8 | 75265 | vianopolis | GO | 2018 | 8 | Aug | 201808 | 20180808 | 8 | 2 | Wed | 8 | Morning | 1.0 | aa4383b373c6aca5d8797843e5594415 | 4869f7a5dfa277a7dca6462dcf3b52b2 | 2018-08-13 08:55:23 | 159.90 | 19.22 | auto | automotivo | 46.0 | 232.0 | 1.0 | 420.0 | 24.0 | 19.0 | 21.0 | 14840.0 | guariba | SP | 35.0 | SP | São Paulo | Sudeste | 14840.0 | -21.396342 | -48.245580 | guariba | SP |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | 7c142cf63193a1473d2e66489a9ae977 | 59296 | sao goncalo do amarante | RN | 2017 | 11 | Nov | 201711 | 20171118 | 18 | 5 | Sat | 19 | Night | 1.0 | d0b61bfb1de832b15ba9d266ca96e5b0 | 66922902710d126a0e7d26b0e3805106 | 2017-11-23 19:45:59 | 45.00 | 27.20 | pet_shop | pet_shop | 59.0 | 468.0 | 3.0 | 450.0 | 30.0 | 10.0 | 20.0 | 31842.0 | belo horizonte | MG | 31.0 | MG | Minas Gerais | Sudeste | 31842.0 | -19.919438 | -44.022020 | belo horizonte | MG |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | 72632f0f9dd73dfee390c9b22eb56dd6 | 9195 | santo andre | SP | 2018 | 2 | Feb | 201802 | 20180213 | 13 | 1 | Tue | 21 | Night | 1.0 | 65266b2da20d04dbe00c5c2d3bb7859e | 2c9e548be18521d1c43cde1c582c6de8 | 2018-02-19 20:31:37 | 19.90 | 8.72 | stationery | papelaria | 38.0 | 316.0 | 4.0 | 250.0 | 51.0 | 15.0 | 15.0 | 8752.0 | mogi das cruzes | SP | 35.0 | SP | São Paulo | Sudeste | 8752.0 | -23.611175 | -46.268467 | mogi das cruzes | SP |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 113061 | 536dc10adea685bee0cb83614fe3353f | 40050a9445eb9acbd618fb333252b5ca | delivered | 2018-06-30 20:03:53 | 2018-06-30 20:15:06 | 2018-07-03 12:02:00 | 2018-07-11 00:21:47 | 2018-07-30 | cedd18bad262f4bb3507ce420c8cf548 | 44095 | feira de santana | BA | 2018 | 6 | Jun | 201806 | 20180630 | 30 | 5 | Sat | 20 | Night | 1.0 | dff7fe87b6455dba4be604aa1d2938fa | 1cbd32d00d01bb8087a5eb088612fd9c | 2018-07-09 20:15:06 | 138.23 | 24.12 | computers_accessories | informatica_acessorios | 45.0 | 263.0 | 3.0 | 1042.0 | 36.0 | 18.0 | 33.0 | 3363.0 | sp / sp | SP | 35.0 | SP | São Paulo | Sudeste | 3363.0 | -23.564852 | -46.543604 | sao paulo | SP |
| 113190 | 25d2bfa43663a23586afd12f15b542e7 | 9d8c06734fde9823ace11a4b5929b5a7 | delivered | 2018-05-22 21:13:21 | 2018-05-22 21:35:40 | 2018-05-24 12:28:00 | 2018-06-12 23:11:29 | 2018-06-08 | e55e436481078787e32349cee9febf5e | 39803 | teofilo otoni | MG | 2018 | 5 | May | 201805 | 20180522 | 22 | 1 | Tue | 21 | Night | 1.0 | 6e1c2008dea1929b9b6c27fa01381e90 | edf3fabebcc20f7463cc9c53da932ea8 | 2018-05-28 21:31:24 | 219.90 | 24.12 | furniture_decor | moveis_decoracao | 19.0 | 531.0 | 1.0 | 5900.0 | 41.0 | 21.0 | 41.0 | 8320.0 | sao paulo | SP | 35.0 | SP | São Paulo | Sudeste | 8320.0 | -23.637826 | -46.501854 | sao paulo | SP |
| 113296 | 1565f22aa9452ff278638e87cc895678 | 56772dfbcbe7df908a284ff0d53adf7d | delivered | 2018-05-15 17:41:00 | 2018-05-16 03:35:29 | 2018-05-16 17:20:00 | 2018-05-21 14:31:41 | 2018-05-29 | 6ceea7c1088e15ab3c67980a2d9bb309 | 9687 | sao bernardo do campo | SP | 2018 | 5 | May | 201805 | 20180515 | 15 | 1 | Tue | 17 | Afternoon | 1.0 | 9c1e194db1d35a79d962ea610bfe0868 | f3862c2188522d89860c38a3ea8b550d | 2018-05-22 03:35:29 | 15.50 | 12.79 | perfumery | perfumaria | 40.0 | 871.0 | 1.0 | 83.0 | 17.0 | 8.0 | 13.0 | 14092.0 | ribeirao preto | SP | 35.0 | SP | São Paulo | Sudeste | 14092.0 | -21.184373 | -47.782852 | ribeirao preto | SP |
| 113336 | 32cf1aa0aa0ea3a91c9779d558515782 | a365ff91be1ba9a1e7c3251c24848f74 | delivered | 2018-08-01 16:13:13 | 2018-08-01 17:24:08 | 2018-08-06 13:23:00 | 2018-08-13 22:41:24 | 2018-08-24 | 5836ceeb27eff5b9b4747e0f4c250297 | 28680 | cachoeiras de macacu | RJ | 2018 | 8 | Aug | 201808 | 20180801 | 1 | 2 | Wed | 16 | Afternoon | 1.0 | 32f186a3f6239888c37adf90db857098 | 90b36c5aeb728d5504a39f435bef0c5f | 2018-08-07 17:24:08 | 29.90 | 23.01 | food_drink | alimentos_bebidas | 57.0 | 1322.0 | 1.0 | 1350.0 | 18.0 | 17.0 | 16.0 | 13835.0 | conchal | SP | 35.0 | SP | São Paulo | Sudeste | 13835.0 | -22.379361 | -47.178108 | conchal | SP |
| 113358 | 9b7fd5b0ef330dcbbb6397e5971faeec | 4442f152efb5f70fd55dcd6267d1afa4 | delivered | 2017-05-17 15:48:44 | 2017-05-17 16:15:16 | 2017-05-18 13:30:39 | 2017-05-29 15:18:32 | 2017-06-08 | 45265f9d6c53836d632f904a3c1d0f1c | 13604 | araras | SP | 2017 | 5 | May | 201705 | 20170517 | 17 | 2 | Wed | 15 | Afternoon | 1.0 | afb83a05bfcf46258f04ce060671d5b4 | e2aee0892199b1d92530e371abd825bf | 2017-05-23 16:15:16 | 39.00 | 15.10 | fixed_telephony | telefonia_fixa | 49.0 | 709.0 | 5.0 | 150.0 | 17.0 | 3.0 | 12.0 | 86600.0 | rolandia | PR | 41.0 | PR | Paraná | Sul | 86600.0 | -23.340759 | -51.399937 | rolandia | PR |
6433 rows × 49 columns
bed_bath_table 카테고리 seller 위치 분포
# Zipping locations
lats = list(df_cat_seller.query('product_category_name == "bed_bath_table"')['geolocation_lat'].dropna().values)
longs = list(df_cat_seller.query('product_category_name == "bed_bath_table"')['geolocation_lng'].dropna().values)
locations = list(zip(lats, longs))
# Creating a mapa using folium
map1 = folium.Map(location=[-15, -50], zoom_start=4.0)
# Plugin: FastMarkerCluster
FastMarkerCluster(data=locations).add_to(map1)
map1
주문가격, 운임 등을 살펴보고 전자상거래에 의해 이동하는 돈을 분석
# Grouping data
df_month_aggreg = df_orders_filt.groupby(by=['order_purchase_year', 'order_purchase_year_month'], as_index=False)
df_month_aggreg = df_month_aggreg.agg({
'order_id': 'count',
'price': 'sum',
'freight_value': 'sum'
})
# Adding new columns for analysis
df_month_aggreg['price_per_order'] = df_month_aggreg['price'] / df_month_aggreg['order_id']
df_month_aggreg['freight_per_order'] = df_month_aggreg['freight_value'] / df_month_aggreg['order_id']
df_month_aggreg.head()
| order_purchase_year | order_purchase_year_month | order_id | price | freight_value | price_per_order | freight_per_order | |
|---|---|---|---|---|---|---|---|
| 0 | 2017 | 201701 | 966 | 120312.87 | 16875.62 | 124.547484 | 17.469586 |
| 1 | 2017 | 201702 | 1998 | 247303.02 | 38977.60 | 123.775285 | 19.508308 |
| 2 | 2017 | 201703 | 3041 | 374344.30 | 57704.29 | 123.099079 | 18.975432 |
| 3 | 2017 | 201704 | 2697 | 359927.23 | 52495.01 | 133.454664 | 19.464223 |
| 4 | 2017 | 201705 | 4176 | 506071.14 | 80119.81 | 121.185618 | 19.185778 |
# Creating figure
fig = plt.figure(constrained_layout=True, figsize=(15, 12))
# Axis definition
gs = GridSpec(2, 3, figure=fig)
ax1 = fig.add_subplot(gs[0, :])
ax2 = fig.add_subplot(gs[1, 0])
ax3 = fig.add_subplot(gs[1, 1:])
# Plot 1 - Evolution of total orders and total sales on e-commerce
sns.lineplot(x='order_purchase_year_month', y='price', ax=ax1, data=df_month_aggreg, linewidth=2,
color='darkslateblue', marker='o', label='Total Amount')
ax1_twx = ax1.twinx()
single_countplot(df_orders_filt, x='order_purchase_year_month', ax=ax1_twx, order=False, palette='YlGnBu_r')
ax1_twx.set_yticks(np.arange(0, 20000, 2500))
# Customizing the first plot
format_spines(ax1)
for tick in ax1.get_xticklabels():
tick.set_rotation(45)
for x, y in df_month_aggreg.price.items():
ax1.annotate(str(round(y/1000, 1))+'K', xy=(x, y), textcoords='offset points', xytext=(0, 10),
ha='center', color='dimgrey')
ax1.annotate(f'Highest Value Sold on History\n(Black Friday?)', (10, 1000000), xytext=(-120, -20),
textcoords='offset points', bbox=dict(boxstyle="round4", fc="w", pad=.8),
arrowprops=dict(arrowstyle='-|>', fc='w'), color='dimgrey', ha='center')
ax1.set_title('Evolution of E-commerce: Total Orders and Total Amount Sold (R$)', size=14, color='dimgrey', pad=20)
# Plot 2 - Big Numbers of Sales Evolution
month_comparison = ['201701', '201702', '201703', '201704', '201705', '201706', '201707', '201708',
'201801', '201802', '201803', '201804', '201805', '201806', '201807', '201808']
df_sales_compare = df_month_aggreg.query('order_purchase_year_month in (@month_comparison)')
sold_2017 = df_sales_compare.query('order_purchase_year == 2017')['price'].sum()
sold_2018 = df_sales_compare.query('order_purchase_year == 2018')['price'].sum()
growth = 1 + (sold_2017 / sold_2018)
ax2.text(0.50, 0.73, 'R$' + str(round(sold_2017/1000000, 2)) + 'M', fontsize=40, color='mediumseagreen', ha='center')
ax2.text(0.50, 0.60, 'total amount sold in 2017\nbetween January and August', fontsize=10, ha='center')
ax2.text(0.50, 0.36, 'R$' + str(round(sold_2018/1000000, 2)) + 'M', fontsize=60, color='darkslateblue', ha='center')
ax2.text(0.50, 0.25, 'total amount sold in 2018\nbetween January and August', fontsize=10, ha='center')
signal = '+' if growth > 0 else '-'
ax2.text(0.50, 0.13, f'{signal}{str(round(100 * growth, 2))}%', fontsize=14, ha='center', color='white', style='italic', weight='bold',
bbox=dict(facecolor='navy', alpha=0.5, pad=10, boxstyle='round, pad=.7'))
ax2.axis('off')
# Plot 3 - Evolution of mean freight value paid by the customers
sns.lineplot(x='order_purchase_year_month', y='freight_per_order', data=df_month_aggreg, linewidth=2,
color='silver', marker='o', ax=ax3)
format_spines(ax3, right_border=False)
for tick in ax3.get_xticklabels():
tick.set_rotation(45)
for x, y in df_month_aggreg.freight_per_order.items():
ax3.annotate(round(y, 2), xy=(x, y), textcoords='offset points', xytext=(0, 10),
ha='center', color='dimgrey')
ax3.set_title('Evolution of Average Freight Value (RS) Paid by Customers', size=14, color='dimgrey', pad=20)
plt.tight_layout()
plt.show()
브라질 국가에서 총 판매액(가격 합계)이 어디 집중되는지
def mean_sum_analysis(df, group_col, value_col, orient='vertical', palette='plasma', figsize=(15, 6)):
"""
Parâmetros
----------
classifiers: conjunto de classificadores em forma de dicionário [dict]
X: array com os dados a serem utilizados no treinamento [np.array]
y: array com o vetor target do modelo [np.array]
Retorno
-------
None
"""
# Grouping data
df_mean = df.groupby(group_col, as_index=False).mean()
df_sum = df.groupby(group_col, as_index=False).sum()
# Sorting grouped dataframes
df_mean.sort_values(by=value_col, ascending=False, inplace=True)
sorter = list(df_mean[group_col].values)
sorter_idx = dict(zip(sorter, range(len(sorter))))
df_sum['mean_rank'] = df_mean[group_col].map(sorter_idx)
df_sum.sort_values(by='mean_rank', inplace=True)
df_sum.drop('mean_rank', axis=1, inplace=True)
# Plotting data
fig, axs = plt.subplots(ncols=2, figsize=figsize)
if orient == 'vertical':
sns.barplot(x=value_col, y=group_col, data=df_mean, ax=axs[0], palette=palette)
sns.barplot(x=value_col, y=group_col, data=df_sum, ax=axs[1], palette=palette)
AnnotateBars(n_dec=0, font_size=12, color='black').horizontal(axs[0])
AnnotateBars(n_dec=0, font_size=12, color='black').horizontal(axs[1])
elif orient == 'horizontal':
sns.barplot(x=group_col, y=value_col, data=df_mean, ax=axs[0], palette=palette)
sns.barplot(x=group_col, y=value_col, data=df_sum, ax=axs[1], palette=palette)
AnnotateBars(n_dec=0, font_size=12, color='black').vertical(axs[0])
AnnotateBars(n_dec=0, font_size=12, color='black').vertical(axs[1])
# Customizing plot
for ax in axs:
format_spines(ax, right_border=False)
ax.set_ylabel('')
axs[0].set_title(f'Mean of {value_col} by {group_col}', size=14, color='dimgrey')
axs[1].set_title(f'Sum of {value_col} by {group_col}', size=14, color='dimgrey')
plt.tight_layout()
plt.show()
mean_sum_analysis(df_orders_filt, 'customer_state', 'price', palette='viridis', figsize=(15, 10))
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-207-761fc96bd112> in <module> ----> 1 mean_sum_analysis(df_orders_filt, 'customer_state', 'price', palette='viridis', figsize=(15, 10)) <ipython-input-206-e44f41db7d28> in mean_sum_analysis(df, group_col, value_col, orient, palette, figsize) 29 sns.barplot(x=value_col, y=group_col, data=df_mean, ax=axs[0], palette=palette) 30 sns.barplot(x=value_col, y=group_col, data=df_sum, ax=axs[1], palette=palette) ---> 31 AnnotateBars(n_dec=0, font_size=12, color='black').horizontal(axs[0]) 32 AnnotateBars(n_dec=0, font_size=12, color='black').horizontal(axs[1]) 33 elif orient == 'horizontal': TypeError: AnnotateBars() takes no arguments
mean_sum_analysis(df_orders_filt, 'customer_state', 'freight_value', palette='viridis', figsize=(15, 10))
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-208-b9b205be5667> in <module> ----> 1 mean_sum_analysis(df_orders_filt, 'customer_state', 'freight_value', palette='viridis', figsize=(15, 10)) <ipython-input-206-e44f41db7d28> in mean_sum_analysis(df, group_col, value_col, orient, palette, figsize) 29 sns.barplot(x=value_col, y=group_col, data=df_mean, ax=axs[0], palette=palette) 30 sns.barplot(x=value_col, y=group_col, data=df_sum, ax=axs[1], palette=palette) ---> 31 AnnotateBars(n_dec=0, font_size=12, color='black').horizontal(axs[0]) 32 AnnotateBars(n_dec=0, font_size=12, color='black').horizontal(axs[1]) 33 elif orient == 'horizontal': TypeError: AnnotateBars() takes no arguments
def calc_working_days(date_series1, date_series2, convert=True):
"""
This functions receives two date series as args and calculates the working days between each of its rows.
Parameters
----------
:param date_series1: first date series to be used on working days calculation [type: pd.Series]
:param date_series2: second date series to subtract the first one [type: pd.Series]
:param convert: flag that guides the series conversions in datetime objects [type: bool, default: True]
Return
------
:return: wd_list: list with working days calculations between two date series
Application
-----------
# Calculating the working days between two date series
working_days = calc_working_days(df['purchase_date'], df['delivered_date'], convert=True)
"""
# Auxiliar function for threating exceptions during the np.busday_count() function
def handle_working_day_calc(d1, d2):
try:
date_diff = np.busday_count(d1, d2)
return date_diff
except:
return np.NaN
# Applying conversion on series in datetime data
if convert:
date_series1 = pd.to_datetime(date_series1).values.astype('datetime64[D]')
date_series2 = pd.to_datetime(date_series2).values.astype('datetime64[D]')
# Building a list with working days calculations between the two dates
wd_list = [handle_working_day_calc(d1, d2) for d1, d2 in zip(date_series1, date_series2)]
return wd_list
# Calculating working days between purchasing, delivering and estimated delivery
purchasing = df_orders_filt['order_purchase_timestamp']
delivered = df_orders_filt['order_delivered_customer_date']
estimated = df_orders_filt['order_estimated_delivery_date']
df_orders_filt['time_to_delivery'] = calc_working_days(purchasing, delivered, convert=True)
df_orders_filt['diff_estimated_delivery'] = calc_working_days(estimated, delivered, convert=True)
# Grouping data by state
states_avg_grouped = df_orders_filt.groupby(by='customer_state', as_index=False).mean()
states_freight_paid = states_avg_grouped.loc[:, ['customer_state', 'freight_value']]
states_time_to_delivery = states_avg_grouped.loc[:, ['customer_state', 'time_to_delivery']]
states_estimated_delivery = states_avg_grouped.loc[:, ['customer_state', 'diff_estimated_delivery']]
# Sorting data
states_freight_paid = states_freight_paid.sort_values(by='freight_value', ascending=False)
states_time_to_delivery = states_time_to_delivery.sort_values(by='time_to_delivery', ascending=False)
states_estimated_delivery = states_estimated_delivery.sort_values(by='diff_estimated_delivery')
fig, axs = plt.subplots(3, 3, figsize=(15, 10))
# Plot Pack 01 - Freight value paid on states
sns.barplot(x='freight_value', y='customer_state', data=states_freight_paid.head(), ax=axs[1, 0], palette='viridis')
axs[1, 0].set_title('Top 5 States with Highest \nAverage Freight Value', size=12, color='black')
sns.barplot(x='freight_value', y='customer_state', data=states_freight_paid.tail(), ax=axs[2, 0], palette='viridis_r')
axs[2, 0].set_title('Top 5 States with Lowest \nAverage Freight Value', size=12, color='black')
for ax in axs[1, 0], axs[2, 0]:
ax.set_xlabel('Mean Freight Value')
ax.set_xlim(0, states_freight_paid['freight_value'].max())
format_spines(ax, right_border=False)
ax.set_ylabel('')
# Annotations
axs[0, 0].text(0.50, 0.30, f'R${round(df_orders_filt.freight_value.mean(), 2)}', fontsize=45, ha='center')
axs[0, 0].text(0.50, 0.12, 'is the mean value of freight paid', fontsize=12, ha='center')
axs[0, 0].text(0.50, 0.00, 'for online shopping', fontsize=12, ha='center')
axs[0, 0].axis('off')
# Plot Pack 02 - Time to delivery on states
sns.barplot(x='time_to_delivery', y='customer_state', data=states_time_to_delivery.head(), ax=axs[1, 1], palette='viridis')
axs[1, 1].set_title('Top 5 States with Highest \nAverage Time to Delivery', size=12, color='black')
sns.barplot(x='time_to_delivery', y='customer_state', data=states_time_to_delivery.tail(), ax=axs[2, 1], palette='viridis_r')
axs[2, 1].set_title('Top 5 States with Lowest \nAverage Time do Delivery', size=12, color='black')
for ax in axs[1, 1], axs[2, 1]:
ax.set_xlabel('Time to Delivery')
ax.set_xlim(0, states_time_to_delivery['time_to_delivery'].max())
format_spines(ax, right_border=False)
ax.set_ylabel('')
# Annotations
axs[0, 1].text(0.40, 0.30, f'{int(df_orders_filt.time_to_delivery.mean())}', fontsize=45, ha='center')
axs[0, 1].text(0.60, 0.30, 'days', fontsize=12, ha='center')
axs[0, 1].text(0.50, 0.12, 'is the average delay for delivery', fontsize=12, ha='center')
axs[0, 1].text(0.50, 0.00, 'for online shopping', fontsize=12, ha='center')
axs[0, 1].axis('off')
# Plot Pack 03 - Differnece between delivered and estimated on states
sns.barplot(x='diff_estimated_delivery', y='customer_state', data=states_estimated_delivery.head(), ax=axs[1, 2], palette='viridis')
axs[1, 2].set_title('Top 5 States where Delivery is \nReally Fast Comparing to Estimated', size=12, color='black')
sns.barplot(x='diff_estimated_delivery', y='customer_state', data=states_estimated_delivery.tail(), ax=axs[2, 2], palette='viridis_r')
axs[2, 2].set_title('Top 5 States where Delivery is \nNot So Fast Comparing to Estimated', size=12, color='black')
for ax in axs[1, 2], axs[2, 2]:
ax.set_xlabel('Days Between Delivery and Estimated')
ax.set_xlim(states_estimated_delivery['diff_estimated_delivery'].min()-1,
states_estimated_delivery['diff_estimated_delivery'].max()+1)
format_spines(ax, right_border=False)
ax.set_ylabel('')
# Annotations
axs[0, 2].text(0.40, 0.30, f'{int(df_orders_filt.diff_estimated_delivery.mean())}', fontsize=45, ha='center')
axs[0, 2].text(0.60, 0.30, 'days', fontsize=12, ha='center')
axs[0, 2].text(0.50, 0.12, 'is the average difference between', fontsize=12, ha='center')
axs[0, 2].text(0.50, 0.00, 'delivery and estimated date', fontsize=12, ha='center')
axs[0, 2].axis('off')
plt.suptitle('E-Commerce on Brazilian States', size=16)
plt.tight_layout()
plt.show()
def donut_plot(df, col, ax, label_names=None, text='', colors=['crimson', 'navy'], circle_radius=0.8,
title=f'Gráfico de Rosca', flag_ruido=0):
"""
Etapas:
1. definição de funções úteis para mostrar rótulos em valor absoluto e porcentagem
2. criação de figura e círculo central de raio pré-definido
3. plotagem do gráfico de pizza e adição do círculo central
4. configuração final da plotagem
Argumentos:
df -- DataFrame alvo da análise [pandas.DataFrame]
col -- coluna do DataFrame a ser analisada [string]
label_names -- nomes customizados a serem plotados como labels [list]
text -- texto central a ser posicionado [string / default: '']
colors -- cores das entradas [list / default: ['crimson', 'navy']]
figsize -- dimensões da plotagem [tupla / default: (8, 8)]
circle_radius -- raio do círculo central [float / default: 0.8]
Retorno:
None
"""
# Retorno dos valores e definição da figura
values = df[col].value_counts().values
if label_names is None:
label_names = df[col].value_counts().index
# Verificando parâmetro de supressão de alguma categoria da análise
if flag_ruido > 0:
values = values[:-flag_ruido]
label_names = label_names[:-flag_ruido]
# Plotando gráfico de rosca
center_circle = plt.Circle((0, 0), circle_radius, color='white')
ax.pie(values, labels=label_names, colors=colors, autopct=make_autopct(values))
ax.add_artist(center_circle)
# Configurando argumentos do texto central
kwargs = dict(size=20, fontweight='bold', va='center')
ax.text(0, 0, text, ha='center', **kwargs)
ax.set_title(title, size=14, color='dimgrey')
def make_autopct(values):
"""
Etapas:
1. definição de função para formatação dos rótulos
Argumentos:
values -- valores extraídos da função value_counts() da coluna de análise [list]
Retorno:
my_autopct -- string formatada para plotagem dos rótulos
"""
def my_autopct(pct):
total = sum(values)
val = int(round(pct * total / 100.0))
return '{p:.1f}%\n({v:d})'.format(p=pct, v=val)
return my_autopct
# Grouping data
df_orders_pay = df_orders_filt.merge(olist_order_payments, how='left', on='order_id')
# Creating figure
fig = plt.figure(constrained_layout=True, figsize=(15, 12))
# Axis definition
gs = GridSpec(2, 2, figure=fig)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[1, :])
# Plot 1 - Payment types in a donut chart
colors = ['darkslateblue', 'cornflowerblue', 'silver', 'darkviolet', 'crimson']
label_names = df_orders_pay['payment_type'].value_counts().index
donut_plot(df_orders_pay, col='payment_type', ax=ax1, label_names=label_names, colors=colors,
title='Count of Transactions by Payment Type', text=f'{len(df_orders_pay)}\npayments \nregistered')
# Plot 2 - Payment installments
single_countplot(df_orders_pay, ax=ax2, y='payment_installments')
ax2.set_title('A Distribution of Payment Installments', color='dimgrey', size=12)
# Plot 3 - Evolution of payment types
payment_evl = df_orders_pay.groupby(by=['order_purchase_year_month', 'payment_type'], as_index=False).count()
payment_evl = payment_evl.loc[:, ['order_purchase_year_month', 'payment_type', 'order_id']]
payment_evl = payment_evl.sort_values(by=['order_purchase_year_month', 'order_id'], ascending=[True, False])
sns.lineplot(x='order_purchase_year_month', y='order_id', data=payment_evl, ax=ax3, hue='payment_type',
style='payment_type', size='payment_type', palette=colors, marker='o')
format_spines(ax3, right_border=False)
ax3.set_title('Evolution of Payment Types in Brazilian E-Commerce', size=12, color='dimgrey')
plt.show()
df_comments = olist_order_reviews.loc[:, ['review_score', 'review_comment_message']]
df_comments = df_comments.dropna(subset=['review_comment_message'])
df_comments = df_comments.reset_index(drop=True)
print(f'Dataset shape: {df_comments.shape}')
df_comments.columns = ['score', 'comment']
df_comments.head()
Dataset shape: (40977, 2)
| score | comment | |
|---|---|---|
| 0 | 5 | Recebi bem antes do prazo estipulado. |
| 1 | 5 | Parabéns lojas lannister adorei comprar pela I... |
| 2 | 4 | aparelho eficiente. no site a marca do aparelh... |
| 3 | 4 | Mas um pouco ,travando...pelo valor ta Boa.\r\n |
| 4 | 5 | Vendedor confiável, produto ok e entrega antes... |
정규표현식
def find_patterns(re_pattern, text_list):
"""
Args:
---------
re_pattern: regular expression pattern to be used on search [type: string]
text_list: list with text strings [type: list]
Returns:
positions_dict: python dictionary with key-value pars as below:
text_idx: [(start_pattern1, end_pattern1), (start_pattern1, end_pattern2), ... (start_n, end_n)]
"""
# Compiling the Regular Expression passed as a arg
p = re.compile(re_pattern)
positions_dict = {}
i = 0
for c in text_list:
match_list = []
iterator = p.finditer(c)
for match in iterator:
match_list.append(match.span())
control_key = f'Text idx {i}'
if len(match_list) == 0:
pass
else:
positions_dict[control_key] = match_list
i += 1
"""p = '[R]{0,1}\$[ ]{0,}\d+(,|\.)\d+'
pattern_dict = find_patterns(p, reviews_breakline)
print(len(pattern_dict))
pattern_dict
for idx in [int(c.split(' ')[-1]) for c in list(pattern_dict.keys())]:
print(f'{reviews_breakline[idx]}\n')"""
return positions_dict
def print_step_result(text_list_before, text_list_after, idx_list):
"""
Args:
----------
text_list_before: list object with text content before transformation [type: list]
text_list_after: list object with text content after transformation [type: list]
idx_list: list object with indexes to be printed [type: list]
"""
# Iterating over string examples
i = 1
for idx in idx_list:
print(f'--- Text {i} ---\n')
print(f'Before: \n{text_list_before[idx]}\n')
print(f'After: \n{text_list_after[idx]}\n')
i += 1
Breakline and Carriage Return
def re_breakline(text_list):
"""
Args:
----------
text_list: list object with text content to be prepared [type: list]
"""
# Applying regex
return [re.sub('[\n\r]', ' ', r) for r in text_list]
# Creating a list of comment reviews
reviews = list(df_comments['comment'].values)
# Applying RegEx
reviews_breakline = re_breakline(reviews)
df_comments['re_breakline'] = reviews_breakline
# Verifying results
print_step_result(reviews, reviews_breakline, idx_list=[48])
--- Text 1 --- Before: Estava faltando apenas um produto, eu recebi hoje , muito obrigada! Tudo certo! Att Elenice. After: Estava faltando apenas um produto, eu recebi hoje , muito obrigada! Tudo certo! Att Elenice.
Sites and Hiperlinks
def re_hiperlinks(text_list):
"""
Args:
----------
text_list: list object with text content to be prepared [type: list]
"""
# Applying regex
pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
return [re.sub(pattern, ' link ', r) for r in text_list]
# Applying RegEx
reviews_hiperlinks = re_hiperlinks(reviews_breakline)
df_comments['re_hiperlinks'] = reviews_hiperlinks
# Verifying results
print_step_result(reviews_breakline, reviews_hiperlinks, idx_list=[10796, 12782])
--- Text 1 --- Before: O produto chegou a ser entregue, porém foi recusado, poisja havia cancelado por ter comprado o tamanho errado! After: O produto chegou a ser entregue, porém foi recusado, poisja havia cancelado por ter comprado o tamanho errado! --- Text 2 --- Before: Muito obrigada lannister After: Muito obrigada lannister
Dates
def re_dates(text_list):
"""
Args:
----------
text_list: list object with text content to be prepared [type: list]
"""
# Applying regex
pattern = '([0-2][0-9]|(3)[0-1])(\/|\.)(((0)[0-9])|((1)[0-2]))(\/|\.)\d{2,4}'
return [re.sub(pattern, ' data ', r) for r in text_list]
# Applying RegEx
reviews_dates = re_dates(reviews_hiperlinks)
df_comments['re_dates'] = reviews_dates
# Verifying results
print_step_result(reviews_hiperlinks, reviews_dates, idx_list=[26665, 41497, 41674])
--- Text 1 --- Before: Ainda não chegou , vou esperar até o dia 26, prazo que me foi dado. Duas estrelas, só por enquanto. After: Ainda não chegou , vou esperar até o dia 26, prazo que me foi dado. Duas estrelas, só por enquanto. --- Text 2 ---
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) <ipython-input-236-e253b37471ef> in <module> 4 5 # Verifying results ----> 6 print_step_result(reviews_hiperlinks, reviews_dates, idx_list=[26665, 41497, 41674]) <ipython-input-221-e0bb0a4b8403> in print_step_result(text_list_before, text_list_after, idx_list) 49 for idx in idx_list: 50 print(f'--- Text {i} ---\n') ---> 51 print(f'Before: \n{text_list_before[idx]}\n') 52 print(f'After: \n{text_list_after[idx]}\n') 53 i += 1 IndexError: list index out of range
Money
def re_money(text_list):
"""
Args:
----------
text_list: list object with text content to be prepared [type: list]
"""
# Applying regex
pattern = '[R]{0,1}\$[ ]{0,}\d+(,|\.)\d+'
return [re.sub(pattern, ' dinheiro ', r) for r in text_list]
# Applying RegEx
reviews_money = re_money(reviews_dates)
df_comments['re_money'] = reviews_money
# Verifying results
print_step_result(reviews_dates, reviews_money, idx_list=[26020, 33297, 32998])
--- Text 1 --- Before: Produto bom e chegou no prazo estipulado! After: Produto bom e chegou no prazo estipulado! --- Text 2 --- Before: muito satisfeita After: muito satisfeita --- Text 3 --- Before: embora o prazo seja de 14 dias úteis, o produto comprado no dia 09/05 só veio chegar no dia 2/6 e alem de chegar super atrasado ainda veio o produto errado. After: embora o prazo seja de 14 dias úteis, o produto comprado no dia 09/05 só veio chegar no dia 2/6 e alem de chegar super atrasado ainda veio o produto errado.
Numbers
def re_numbers(text_list):
"""
Args:
----------
text_series: list object with text content to be prepared [type: list]
"""
# Applying regex
return [re.sub('[0-9]+', ' numero ', r) for r in text_list]
# Applying RegEx
reviews_numbers = re_numbers(reviews_money)
df_comments['re_numbers'] = reviews_numbers
# Verifying results
print_step_result(reviews_money, reviews_numbers, idx_list=[68])
--- Text 1 --- Before: Comprei o produto dia 25 de fevereiro e hoje dia 29 de marco não fora entregue na minha residência. Não sei se os correios desse Brasil e péssimo ou foi a própria loja que demorou postar. After: Comprei o produto dia numero de fevereiro e hoje dia numero de marco não fora entregue na minha residência. Não sei se os correios desse Brasil e péssimo ou foi a própria loja que demorou postar.
negation
def re_negation(text_list):
"""
Args:
----------
text_series: list object with text content to be prepared [type: list]
"""
# Applying regex
return [re.sub('([nN][ãÃaA][oO]|[ñÑ]| [nN] )', ' negação ', r) for r in text_list]
# Applying RegEx
reviews_negation = re_negation(reviews_numbers)
df_comments['re_negation'] = reviews_negation
# Verifying results
print_step_result(reviews_numbers, reviews_negation, idx_list=[4783, 4627, 4856, 4904])
--- Text 1 --- Before: Ótimo relógio estou bastante satisfeito recomendo a todos essa loja está de parabéns obrigado After: Ótimo relógio estou bastante satisfeito recomendo a todos essa loja está de parabéns obrigado --- Text 2 --- Before: Envio muito rápido e chegou muito rápido também. After: Envio muito rápido e chegou muito rápido também. --- Text 3 --- Before: Gostei do produto. Recomendo. After: Gostei do produto. Recomendo. --- Text 4 --- Before: Igualzinho o que eu pedi. Lindo After: Igualzinho o que eu pedi. Lindo
Special Characters
def re_special_chars(text_list):
"""
Args:
----------
text_series: list object with text content to be prepared [type: list]
"""
# Applying regex
return [re.sub('\W', ' ', r) for r in text_list]
# Applying RegEx
reviews_special_chars = re_special_chars(reviews_negation)
df_comments['re_special_chars'] = reviews_special_chars
# Verifying results
print_step_result(reviews_negation, reviews_special_chars, idx_list=[45, 135, 234])
--- Text 1 --- Before: Este foi o pedido Balde Com numero Peças - Blocos De Montar numero un - dinheiro cada ( negação FOI ENTREGUE) Vendido e entregue targaryen Tapete de Eva Nº Letras numero Peças Crianças numero un - dinheiro (ESTE FOI ENTREG After: Este foi o pedido Balde Com numero Peças Blocos De Montar numero un dinheiro cada negação FOI ENTREGUE Vendido e entregue targaryen Tapete de Eva Nº Letras numero Peças Crianças numero un dinheiro ESTE FOI ENTREG --- Text 2 --- Before: Produto negação recomendo negação ler em nenhum dispositivo After: Produto negação recomendo negação ler em nenhum dispositivo --- Text 3 --- Before: Produto de ótima qualidade. Adorei! After: Produto de ótima qualidade Adorei
Additional Whitespaces
def re_whitespaces(text_list):
"""
Args:
----------
text_series: list object with text content to be prepared [type: list]
"""
# Applying regex
white_spaces = [re.sub('\s+', ' ', r) for r in text_list]
white_spaces_end = [re.sub('[ \t]+$', '', r) for r in white_spaces]
return white_spaces_end
# Applying RegEx
reviews_whitespaces = re_whitespaces(reviews_special_chars)
df_comments['re_whitespaces'] = reviews_whitespaces
# Verifying results
print_step_result(reviews_special_chars, reviews_whitespaces, idx_list=[3, 4, -1])
--- Text 1 --- Before: Mas um pouco travando pelo valor ta Boa After: Mas um pouco travando pelo valor ta Boa --- Text 2 --- Before: Vendedor confiável produto ok e entrega antes do prazo After: Vendedor confiável produto ok e entrega antes do prazo --- Text 3 --- Before: meu produto chegou e ja tenho que devolver pois está com defeito negação segurar carga After: meu produto chegou e ja tenho que devolver pois está com defeito negação segurar carga